src/gromacs/ewald/pme_gather.cu
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2016,2017,2018,2019,2020 by the GROMACS development team.
 * Copyright (c) 2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/*! \internal \file
 *  \brief Implements PME force gathering in CUDA.
 *
 *  \author Aleksei Iupinov <a.yupinov@gmail.com>
 */

#include "gmxpre.h"

#include <cassert>

#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
#include "gromacs/gpu_utils/typecasts.cuh"

#include "pme.cuh"
#include "pme_gpu_calculate_splines.cuh"
#include "pme_grid.h"

/*! \brief
 * An inline CUDA function: unroll the dynamic index accesses to the constant grid sizes to avoid local memory operations.
 */
__device__ __forceinline__ float read_grid_size(const float* realGridSizeFP, const int dimIndex)
{
    switch (dimIndex)
    {
        case XX: return realGridSizeFP[XX];
        case YY: return realGridSizeFP[YY];
        case ZZ: return realGridSizeFP[ZZ];
    }
    assert(false);
    return 0.0F;
}

/*! \brief Reduce the partial force contributions.
 *
 * \tparam     order              The PME order (must be 4).
 * \tparam     atomDataSize       The number of partial force contributions for each atom (currently
 *                                order^2 == 16)
 * \tparam     blockSize          The CUDA block size
 *
 * \param[out] sm_forces          Shared memory array with the output forces (number of elements
 *                                is number of atoms per block)
 * \param[in]  atomIndexLocal     Local atom index
 * \param[in]  splineIndex        Spline index
 * \param[in]  lineIndex          Line index (same as threadLocalId)
 * \param[in]  realGridSizeFP     Local grid size constant
 * \param[in]  fx                 Input force partial component X
 * \param[in]  fy                 Input force partial component Y
 * \param[in]  fz                 Input force partial component Z
 */
template<int order, int atomDataSize, int blockSize>
__device__ __forceinline__ void reduce_atom_forces(float3* __restrict__ sm_forces,
                                                   const int    atomIndexLocal,
                                                   const int    splineIndex,
                                                   const int    lineIndex,
                                                   const float* realGridSizeFP,
                                                   float& fx, // NOLINT(google-runtime-references)
                                                   float& fy, // NOLINT(google-runtime-references)
                                                   float& fz) // NOLINT(google-runtime-references)
{
    if (gmx::isPowerOfTwo(order)) // Only for orders of power of 2
    {
        const unsigned int activeMask = c_fullWarpMask;

        // A tricky shuffle reduction inspired by reduce_force_j_warp_shfl
        // TODO: find out if this is the best in terms of transactions count
        static_assert(order == 4, "Only order of 4 is implemented");
        static_assert(atomDataSize <= warp_size,
                      "TODO: rework for atomDataSize > warp_size (order 8 or larger)");
        const int width = atomDataSize;

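        // The fy and fz partial sums are folded into fx on different lanes (see the assignments
        // below), so that a single chain of shuffles reduces all three components at once.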
        fx += __shfl_down_sync(activeMask, fx, 1, width);
        fy += __shfl_up_sync(activeMask, fy, 1, width);
        fz += __shfl_down_sync(activeMask, fz, 1, width);

        if (splineIndex & 1)
        {
            fx = fy;
        }

        fx += __shfl_down_sync(activeMask, fx, 2, width);
        fz += __shfl_up_sync(activeMask, fz, 2, width);

        if (splineIndex & 2)
        {
            fx = fz;
        }

        // By now fx contains intermediate quad sums of all 3 components:
        // splineIndex    0            1            2 and 3      4            5            6 and 7 8...
        // sum of...      fx0 to fx3   fy0 to fy3   fz0 to fz3   fx4 to fx7   fy4 to fy7   fz4 to fz7 etc.

        // Now we just need to further reduce those groups of 4
        for (int delta = 4; delta < atomDataSize; delta <<= 1)
        {
            fx += __shfl_down_sync(activeMask, fx, delta, width);
        }

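        // After this final reduction, lanes 0, 1 and 2 of each atomDataSize-wide group hold the
        // complete X, Y and Z sums for the atom; they are scaled by the grid size and stored below.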
        const int dimIndex = splineIndex;
        if (dimIndex < DIM)
        {
            const float n = read_grid_size(realGridSizeFP, dimIndex);
            float* __restrict__ sm_forcesAtomIndexOffset =
                    reinterpret_cast<float*>(&sm_forces[atomIndexLocal]);
            sm_forcesAtomIndexOffset[dimIndex] = fx * n;
        }
    }
    else
    {
        // We use blockSize shared memory elements to read fx, or fy, or fz, and then reduce them to
        // fit into smemPerDim elements which are stored separately (first 2 dimensions only)
        const int         smemPerDim   = warp_size;
        const int         smemReserved = (DIM)*smemPerDim;
        __shared__ float  sm_forceReduction[smemReserved + blockSize];
        __shared__ float* sm_forceTemp[DIM];

        const int numWarps = blockSize / smemPerDim;
        const int minStride =
                max(1, atomDataSize / numWarps); // order 4: 128 threads => 4, 256 threads => 2, etc

#pragma unroll
        for (int dimIndex = 0; dimIndex < DIM; dimIndex++)
        {
            int elementIndex = smemReserved + lineIndex;
            // Store input force contributions
            sm_forceReduction[elementIndex] = (dimIndex == XX) ? fx : (dimIndex == YY) ? fy : fz;
            // sync here because two warps write data that the first one consumes below
            __syncthreads();
            // Reduce to fit into smemPerDim (warp size)
#pragma unroll
            for (int redStride = atomDataSize / 2; redStride > minStride; redStride >>= 1)
            {
                if (splineIndex < redStride)
                {
                    sm_forceReduction[elementIndex] += sm_forceReduction[elementIndex + redStride];
                }
            }
            __syncthreads();
            // Last iteration - packing everything to be nearby, storing convenience pointer
            sm_forceTemp[dimIndex] = sm_forceReduction + dimIndex * smemPerDim;
            int redStride          = minStride;
            if (splineIndex < redStride)
            {
                const int packedIndex = atomIndexLocal * redStride + splineIndex;
                sm_forceTemp[dimIndex][packedIndex] =
                        sm_forceReduction[elementIndex] + sm_forceReduction[elementIndex + redStride];
            }
            __syncthreads();
        }

        assert((blockSize / warp_size) >= DIM);
        // assert (atomsPerBlock <= warp_size);

        const int warpIndex = lineIndex / warp_size;
        const int dimIndex  = warpIndex;

        // First 3 warps can now process 1 dimension each
        if (dimIndex < DIM)
        {
            int sourceIndex = lineIndex % warp_size;
#pragma unroll
            for (int redStride = minStride / 2; redStride > 1; redStride >>= 1)
            {
                if (!(splineIndex & redStride))
                {
                    sm_forceTemp[dimIndex][sourceIndex] += sm_forceTemp[dimIndex][sourceIndex + redStride];
                }
            }

            __syncwarp();

            const float n         = read_grid_size(realGridSizeFP, dimIndex);
            const int   atomIndex = sourceIndex / minStride;

            if (sourceIndex == minStride * atomIndex)
            {
                float* __restrict__ sm_forcesAtomIndexOffset =
                        reinterpret_cast<float*>(&sm_forces[atomIndex]);
                sm_forcesAtomIndexOffset[dimIndex] =
                        (sm_forceTemp[dimIndex][sourceIndex] + sm_forceTemp[dimIndex][sourceIndex + 1]) * n;
            }
        }
    }
}

/*! \brief Calculate the sum of the force partial components (in X, Y and Z)
 *
 * \tparam     order              The PME order (must be 4).
 * \tparam     atomsPerWarp       The number of atoms per GPU warp.
 * \tparam     wrapX              Tells if the grid is wrapped in the X dimension.
 * \tparam     wrapY              Tells if the grid is wrapped in the Y dimension.
 * \param[out] fx                 The force partial component in the X dimension.
 * \param[out] fy                 The force partial component in the Y dimension.
 * \param[out] fz                 The force partial component in the Z dimension.
 * \param[in] ithyMin             The thread minimum index in the Y dimension.
 * \param[in] ithyMax             The thread maximum index in the Y dimension.
 * \param[in] ixBase              The grid line index base value in the X dimension.
 * \param[in] iz                  The grid line index in the Z dimension.
 * \param[in] nx                  The grid real size in the X dimension.
 * \param[in] ny                  The grid real size in the Y dimension.
 * \param[in] pny                 The padded grid real size in the Y dimension.
 * \param[in] pnz                 The padded grid real size in the Z dimension.
 * \param[in] atomIndexLocal      The atom index for this thread.
 * \param[in] splineIndexBase     The base value of the spline parameter index.
 * \param[in] tdz                 The theta and dtheta in the Z dimension.
 * \param[in] sm_gridlineIndices  Shared memory array of grid line indices.
 * \param[in] sm_theta            Shared memory array of atom theta values.
 * \param[in] sm_dtheta           Shared memory array of atom dtheta values.
 * \param[in] gm_grid             Global memory array of the grid to use.
 */
template<int order, int atomsPerWarp, bool wrapX, bool wrapY>
__device__ __forceinline__ void sumForceComponents(float* __restrict__ fx,
                                                   float* __restrict__ fy,
                                                   float* __restrict__ fz,
                                                   const int    ithyMin,
                                                   const int    ithyMax,
                                                   const int    ixBase,
                                                   const int    iz,
                                                   const int    nx,
                                                   const int    ny,
                                                   const int    pny,
                                                   const int    pnz,
                                                   const int    atomIndexLocal,
                                                   const int    splineIndexBase,
                                                   const float2 tdz,
                                                   const int* __restrict__ sm_gridlineIndices,
                                                   const float* __restrict__ sm_theta,
                                                   const float* __restrict__ sm_dtheta,
                                                   const float* __restrict__ gm_grid)
{
#pragma unroll
    for (int ithy = ithyMin; ithy < ithyMax; ithy++)
    {
        const int splineIndexY = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
        const float2 tdy       = make_float2(sm_theta[splineIndexY], sm_dtheta[splineIndexY]);

        int iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
        if (wrapY & (iy >= ny))
        {
            iy -= ny;
        }
        const int constOffset = iy * pnz + iz;

#pragma unroll
        for (int ithx = 0; (ithx < order); ithx++)
        {
            int ix = ixBase + ithx;
            if (wrapX & (ix >= nx))
            {
                ix -= nx;
            }
            const int gridIndexGlobal = ix * pny * pnz + constOffset;
            assert(gridIndexGlobal >= 0);
            const float gridValue = gm_grid[gridIndexGlobal];
            assert(isfinite(gridValue));
            const int splineIndexX = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
            const float2 tdx       = make_float2(sm_theta[splineIndexX], sm_dtheta[splineIndexX]);
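            // The partial force along each dimension uses the spline derivative (dtheta) in that
            // dimension and the spline values (theta) in the other two, all scaled by the grid value.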
            const float  fxy1      = tdz.x * gridValue;
            const float  fz1       = tdz.y * gridValue;
            *fx += tdx.y * tdy.x * fxy1;
            *fy += tdx.x * tdy.y * fxy1;
            *fz += tdx.x * tdy.x * fz1;
        }
    }
}

/*! \brief Calculate the grid forces and store them in shared memory.
 *
 * \param[in,out] sm_forces       Shared memory array with the output forces.
 * \param[in] forceIndexLocal     The local (per thread) index in the sm_forces array.
 * \param[in] forceIndexGlobal    The index of the thread in the gm_coefficients array.
 * \param[in] recipBox            The reciprocal box.
 * \param[in] scale               The scale to use when calculating the forces. For the B-state
 * coefficients (when using multiple coefficients on a single grid) the scale will be (1.0 - scale).
 * \param[in] gm_coefficients     Global memory array of the coefficients to use for an unperturbed
 * system or for FEP in state A if a single grid is used (\p multiCoefficientsSingleGrid == true).
 * If two separate grids are used, this should be the coefficients of the grid in question.
 */
__device__ __forceinline__ void calculateAndStoreGridForces(float3* __restrict__ sm_forces,
                                                            const int   forceIndexLocal,
                                                            const int   forceIndexGlobal,
                                                            const float recipBox[DIM][DIM],
                                                            const float scale,
                                                            const float* __restrict__ gm_coefficients)
{
    const float3 atomForces     = sm_forces[forceIndexLocal];
    float        negCoefficient = -scale * gm_coefficients[forceIndexGlobal];
    float3       result;
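    // Transform the accumulated gradient from fractional (grid) coordinates to a Cartesian force;
    // only the non-zero (triangular) elements of the reciprocal box contribute, and the charge and
    // the minus sign are folded into negCoefficient.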
    result.x = negCoefficient * recipBox[XX][XX] * atomForces.x;
    result.y = negCoefficient * (recipBox[XX][YY] * atomForces.x + recipBox[YY][YY] * atomForces.y);
    result.z = negCoefficient
               * (recipBox[XX][ZZ] * atomForces.x + recipBox[YY][ZZ] * atomForces.y
                  + recipBox[ZZ][ZZ] * atomForces.z);
    sm_forces[forceIndexLocal] = result;
}

/*! \brief
 * A CUDA kernel which gathers the atom forces from the grid.
 * The grid is assumed to be wrapped in dimension Z.
 *
 * \tparam     order                The PME order (must be 4 currently).
 * \tparam     wrapX                Tells if the grid is wrapped in the X dimension.
 * \tparam     wrapY                Tells if the grid is wrapped in the Y dimension.
 * \tparam     numGrids             The number of grids to use in the kernel. Can be 1 or 2.
 * \tparam     readGlobal           Tells if we should read spline values from global memory
 * \tparam     threadsPerAtom       How many threads work on each atom
 *
 * \param[in]  kernelParams         All the PME GPU data.
 */
template<int order, bool wrapX, bool wrapY, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom>
__launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__
        void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams)
{
    assert(numGrids == 1 || numGrids == 2);

    /* Global memory pointers */
    const float* __restrict__ gm_coefficientsA = kernelParams.atoms.d_coefficients[0];
    const float* __restrict__ gm_coefficientsB = kernelParams.atoms.d_coefficients[1];
    const float* __restrict__ gm_gridA         = kernelParams.grid.d_realGrid[0];
    const float* __restrict__ gm_gridB         = kernelParams.grid.d_realGrid[1];
    static_assert(sizeof(*kernelParams.atoms.d_forces) == 3 * sizeof(float));
    float* __restrict__ gm_forces = reinterpret_cast<float*>(kernelParams.atoms.d_forces);

    /* Global memory pointers for readGlobal */
    const float* __restrict__ gm_theta         = kernelParams.atoms.d_theta;
    const float* __restrict__ gm_dtheta        = kernelParams.atoms.d_dtheta;
    const int* __restrict__ gm_gridlineIndices = kernelParams.atoms.d_gridlineIndices;

    float3 atomX;
    float  atomCharge;

    const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;

    /* Number of data components and threads for a single atom */
    const int threadsPerAtomValue = (threadsPerAtom == ThreadsPerAtom::Order) ? order : order * order;
    const int atomDataSize        = threadsPerAtomValue;
    const int atomsPerBlock       = c_gatherMaxThreadsPerBlock / atomDataSize;
    // Number of atoms processed by a single warp in spread and gather
    const int atomsPerWarp = warp_size / atomDataSize;

    const int blockSize = atomsPerBlock * atomDataSize;
    assert(blockSize == blockDim.x * blockDim.y * blockDim.z);

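    /* Thread layout: threadIdx.x runs over the Z spline index, threadIdx.y over the Y spline index
     * (a single value when order threads per atom are used), and threadIdx.z over the atoms of the block.
     */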
    /* These are the atom indices - for the shared and global memory */
    const int atomIndexLocal  = threadIdx.z;
    const int atomIndexOffset = blockIndex * atomsPerBlock;
    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;

    /* Early return for fully empty blocks at the end
     * (should only happen for billions of input atoms)
     */
    if (atomIndexOffset >= kernelParams.atoms.nAtoms)
    {
        return;
    }
    // 4 warps per block, 8 atoms per warp *3 *4
    const int        splineParamsSize    = atomsPerBlock * DIM * order;
    const int        gridlineIndicesSize = atomsPerBlock * DIM;
    __shared__ int   sm_gridlineIndices[gridlineIndicesSize];
    __shared__ float sm_theta[splineParamsSize];
    __shared__ float sm_dtheta[splineParamsSize];

    /* Spline Z coordinates */
    const int ithz = threadIdx.x;

    /* These are the spline contribution indices in shared memory */
    const int splineIndex = threadIdx.y * blockDim.x + threadIdx.x;
    const int lineIndex   = (threadIdx.z * (blockDim.x * blockDim.y))
                          + splineIndex; /* And to all the block's particles */

    const int threadLocalId =
            (threadIdx.z * (blockDim.x * blockDim.y)) + blockDim.x * threadIdx.y + threadIdx.x;
    const int threadLocalIdMax = blockDim.x * blockDim.y * blockDim.z;

    if (readGlobal)
    {
        /* Read splines */
        const int localGridlineIndicesIndex = threadLocalId;
        const int globalGridlineIndicesIndex = blockIndex * gridlineIndicesSize + localGridlineIndicesIndex;
        if (localGridlineIndicesIndex < gridlineIndicesSize)
        {
            sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
            assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
        }
        /* With order threads per atom, the loop is needed to make sure we load all data values, as
           each thread must load multiple values; with order*order threads per atom, each thread
           only needs to load one data value. */

        const int iMin = 0;
        const int iMax = (threadsPerAtom == ThreadsPerAtom::Order) ? 3 : 1;

        for (int i = iMin; i < iMax; i++)
        {
            int localSplineParamsIndex =
                    threadLocalId
                    + i * threadLocalIdMax; /* i will always be zero for order*order threads per atom */
            int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex;
            if (localSplineParamsIndex < splineParamsSize)
            {
                sm_theta[localSplineParamsIndex]  = gm_theta[globalSplineParamsIndex];
                sm_dtheta[localSplineParamsIndex] = gm_dtheta[globalSplineParamsIndex];
                assert(isfinite(sm_theta[localSplineParamsIndex]));
                assert(isfinite(sm_dtheta[localSplineParamsIndex]));
            }
        }
        __syncthreads();
    }
    else
    {
        const float3* __restrict__ gm_coordinates = asFloat3(kernelParams.atoms.d_coordinates);
        /* Recalculate splines */
        if (c_useAtomDataPrefetch)
        {
            // charges
            __shared__ float sm_coefficients[atomsPerBlock];
            // Coordinates
            __shared__ float3 sm_coordinates[atomsPerBlock];
            /* Staging coefficients/charges */
            pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients, gm_coefficientsA);

            /* Staging coordinates */
            pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(sm_coordinates, gm_coordinates);
            __syncthreads();
            atomX      = sm_coordinates[atomIndexLocal];
            atomCharge = sm_coefficients[atomIndexLocal];
        }
        else
        {
            atomX      = gm_coordinates[atomIndexGlobal];
            atomCharge = gm_coefficientsA[atomIndexGlobal];
        }
        calculate_splines<order, atomsPerBlock, atomsPerWarp, true, false, numGrids>(
                kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, sm_dtheta, sm_gridlineIndices);
        __syncwarp();
    }
    float fx = 0.0F;
    float fy = 0.0F;
    float fz = 0.0F;

    const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsA[atomIndexGlobal]);

    const int nx  = kernelParams.grid.realGridSize[XX];
    const int ny  = kernelParams.grid.realGridSize[YY];
    const int nz  = kernelParams.grid.realGridSize[ZZ];
    const int pny = kernelParams.grid.realGridSizePadded[YY];
    const int pnz = kernelParams.grid.realGridSizePadded[ZZ];

    const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
    const int warpIndex     = atomIndexLocal / atomsPerWarp;

    const int splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
    const int    splineIndexZ = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
    const float2 tdz          = make_float2(sm_theta[splineIndexZ], sm_dtheta[splineIndexZ]);

    int       iz     = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
    const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];

    if (iz >= nz)
    {
        iz -= nz;
    }

    const int ithyMin = (threadsPerAtom == ThreadsPerAtom::Order) ? 0 : threadIdx.y;
    const int ithyMax = (threadsPerAtom == ThreadsPerAtom::Order) ? order : threadIdx.y + 1;
    if (chargeCheck)
    {
        sumForceComponents<order, atomsPerWarp, wrapX, wrapY>(&fx,
                                                              &fy,
                                                              &fz,
                                                              ithyMin,
                                                              ithyMax,
                                                              ixBase,
                                                              iz,
                                                              nx,
                                                              ny,
                                                              pny,
                                                              pnz,
                                                              atomIndexLocal,
                                                              splineIndexBase,
                                                              tdz,
                                                              sm_gridlineIndices,
                                                              sm_theta,
                                                              sm_dtheta,
                                                              gm_gridA);
    }
    // Reduction of partial force contributions
    __shared__ float3 sm_forces[atomsPerBlock];
    reduce_atom_forces<order, atomDataSize, blockSize>(
            sm_forces, atomIndexLocal, splineIndex, lineIndex, kernelParams.grid.realGridSizeFP, fx, fy, fz);
    __syncthreads();

    /* Calculating the final forces with no component branching, atomsPerBlock threads */
    const int   forceIndexLocal  = threadLocalId;
    const int   forceIndexGlobal = atomIndexOffset + forceIndexLocal;
    const float scale            = kernelParams.current.scale;
    if (forceIndexLocal < atomsPerBlock)
    {
        calculateAndStoreGridForces(
                sm_forces, forceIndexLocal, forceIndexGlobal, kernelParams.current.recipBox, scale, gm_coefficientsA);
    }

    __syncwarp();
    assert(atomsPerBlock <= warp_size);

    /* Writing or adding the final forces component-wise, single warp */
    const int blockForcesSize = atomsPerBlock * DIM;
    const int numIter         = (blockForcesSize + warp_size - 1) / warp_size;
    const int iterThreads     = blockForcesSize / numIter;
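    // For example, assuming c_gatherMaxThreadsPerBlock is 128 (4 warps): with order*order threads
    // per atom this gives atomsPerBlock = 8, blockForcesSize = 24 and numIter = 1, so the first
    // 24 threads write everything; with order threads per atom it is 32 atoms, 96 force components
    // and 3 iterations of a full warp.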
    if (threadLocalId < iterThreads)
    {
#pragma unroll
        for (int i = 0; i < numIter; i++)
        {
            int   outputIndexLocal       = i * iterThreads + threadLocalId;
            int   outputIndexGlobal      = blockIndex * blockForcesSize + outputIndexLocal;
            float outputForceComponent   = (reinterpret_cast<float*>(sm_forces)[outputIndexLocal]);
            gm_forces[outputIndexGlobal] = outputForceComponent;
        }
    }

    if (numGrids == 2)
    {
        /* We must sync here since the same shared memory is used as above. */
        __syncthreads();
        fx                    = 0.0F;
        fy                    = 0.0F;
        fz                    = 0.0F;
        const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsB[atomIndexGlobal]);
        if (chargeCheck)
        {
            sumForceComponents<order, atomsPerWarp, wrapX, wrapY>(&fx,
                                                                  &fy,
                                                                  &fz,
                                                                  ithyMin,
                                                                  ithyMax,
                                                                  ixBase,
                                                                  iz,
                                                                  nx,
                                                                  ny,
                                                                  pny,
                                                                  pnz,
                                                                  atomIndexLocal,
                                                                  splineIndexBase,
                                                                  tdz,
                                                                  sm_gridlineIndices,
                                                                  sm_theta,
                                                                  sm_dtheta,
                                                                  gm_gridB);
        }
        // Reduction of partial force contributions
        reduce_atom_forces<order, atomDataSize, blockSize>(
                sm_forces, atomIndexLocal, splineIndex, lineIndex, kernelParams.grid.realGridSizeFP, fx, fy, fz);
        __syncthreads();

        /* Calculating the final forces with no component branching, atomsPerBlock threads */
        if (forceIndexLocal < atomsPerBlock)
        {
            calculateAndStoreGridForces(sm_forces,
                                        forceIndexLocal,
                                        forceIndexGlobal,
                                        kernelParams.current.recipBox,
                                        1.0F - scale,
                                        gm_coefficientsB);
        }

        __syncwarp();

        /* Writing or adding the final forces component-wise, single warp */
        if (threadLocalId < iterThreads)
        {
#pragma unroll
            for (int i = 0; i < numIter; i++)
            {
                int   outputIndexLocal     = i * iterThreads + threadLocalId;
                int   outputIndexGlobal    = blockIndex * blockForcesSize + outputIndexLocal;
                float outputForceComponent = (reinterpret_cast<float*>(sm_forces)[outputIndexLocal]);
                gm_forces[outputIndexGlobal] += outputForceComponent;
            }
        }
    }
}

//! Kernel instantiations
// clang-format off
template __global__ void pme_gather_kernel<4, true, true, 1, true, ThreadsPerAtom::Order>        (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 1, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 1, false, ThreadsPerAtom::Order>       (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 1, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 2, true, ThreadsPerAtom::Order>        (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 2, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 2, false, ThreadsPerAtom::Order>       (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 2, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
// clang-format on