/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2016,2017,2018,2019,2020 by the GROMACS development team.
 * Copyright (c) 2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/*! \internal \file
 *  \brief Implements PME force gathering in CUDA.
 *
 *  \author Aleksei Iupinov <a.yupinov@gmail.com>
 */

#include "gmxpre.h"

#include <cassert>

#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
#include "gromacs/gpu_utils/typecasts.cuh"

#include "pme.cuh"
#include "pme_gpu_calculate_splines.cuh"
#include "pme_grid.h"

/*! \brief
 * An inline CUDA function: unroll the dynamic index accesses to the constant grid sizes to avoid local memory operations.
 */
__device__ __forceinline__ float read_grid_size(const float* realGridSizeFP, const int dimIndex)
{
    switch (dimIndex)
    {
        case XX: return realGridSizeFP[XX];
        case YY: return realGridSizeFP[YY];
        case ZZ: return realGridSizeFP[ZZ];
    }
    assert(false);
    return 0.0f;
}
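/* For reference: a plain dynamic access such as
 *     return realGridSizeFP[dimIndex]; // dimIndex only known at run time
 * can force the per-thread copy of the grid sizes into local memory; the explicit switch
 * above keeps every access resolvable at compile time once the callers' loops are unrolled.
 */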

/*! \brief Reduce the partial force contributions.
 *
 * \tparam[in] order              The PME order (must be 4).
 * \tparam[in] atomDataSize       The number of partial force contributions for each atom: order^2 (16)
 *                                or order (4), depending on the threads-per-atom layout
 * \tparam[in] blockSize          The CUDA block size
 *
 * \param[out] sm_forces          Shared memory array with the output forces (number of elements
 *                                is number of atoms per block)
 * \param[in]  atomIndexLocal     Local atom index
 * \param[in]  splineIndex        Spline index
 * \param[in]  lineIndex          Line index (same as threadLocalId)
 * \param[in]  realGridSizeFP     Local grid size constant
 * \param[in]  fx                 Input force partial component X
 * \param[in]  fy                 Input force partial component Y
 * \param[in]  fz                 Input force partial component Z
 */
template<int order, int atomDataSize, int blockSize>
__device__ __forceinline__ void reduce_atom_forces(float3* __restrict__ sm_forces,
                                                   const int    atomIndexLocal,
                                                   const int    splineIndex,
                                                   const int    lineIndex,
                                                   const float* realGridSizeFP,
                                                   float&       fx,
                                                   float&       fy,
                                                   float&       fz)
{
    if (gmx::isPowerOfTwo(order)) // Only for orders of power of 2
    {
        const unsigned int activeMask = c_fullWarpMask;

        // A tricky shuffle reduction inspired by reduce_force_j_warp_shfl
        // TODO: find out if this is the best in terms of transactions count
        static_assert(order == 4, "Only order of 4 is implemented");
        static_assert(atomDataSize <= warp_size,
                      "TODO: rework for atomDataSize > warp_size (order 8 or larger)");
        const int width = atomDataSize;

        fx += __shfl_down_sync(activeMask, fx, 1, width);
        fy += __shfl_up_sync(activeMask, fy, 1, width);
        fz += __shfl_down_sync(activeMask, fz, 1, width);

        if (splineIndex & 1)
        {
            fx = fy;
        }

        fx += __shfl_down_sync(activeMask, fx, 2, width);
        fz += __shfl_up_sync(activeMask, fz, 2, width);

        if (splineIndex & 2)
        {
            fx = fz;
        }

        // By now fx contains intermediate quad sums of all 3 components:
        // splineIndex    0            1            2 and 3      4            5            6 and 7 8...
        // sum of...      fx0 to fx3   fy0 to fy3   fz0 to fz3   fx4 to fx7   fy4 to fy7   fz4 to fz7 etc.

        // We have to just further reduce those groups of 4
        for (int delta = 4; delta < atomDataSize; delta <<= 1)
        {
            fx += __shfl_down_sync(activeMask, fx, delta, width);
        }

        const int dimIndex = splineIndex;
        if (dimIndex < DIM)
        {
            const float n = read_grid_size(realGridSizeFP, dimIndex);
            *((float*)(&sm_forces[atomIndexLocal]) + dimIndex) = fx * n;
        }
    }
    else
    {
        // We use blockSize shared memory elements to read fx, or fy, or fz, and then reduce them to
        // fit into smemPerDim elements which are stored separately (first 2 dimensions only)
        const int         smemPerDim   = warp_size;
        const int         smemReserved = (DIM)*smemPerDim;
        __shared__ float  sm_forceReduction[smemReserved + blockSize];
        __shared__ float* sm_forceTemp[DIM];

        const int numWarps = blockSize / smemPerDim;
        const int minStride =
                max(1, atomDataSize / numWarps); // order 4: 128 threads => 4, 256 threads => 2, etc

#pragma unroll
        for (int dimIndex = 0; dimIndex < DIM; dimIndex++)
        {
            int elementIndex = smemReserved + lineIndex;
            // Store input force contributions
            sm_forceReduction[elementIndex] = (dimIndex == XX) ? fx : (dimIndex == YY) ? fy : fz;
            // sync here because two warps write data that the first one consumes below
            __syncthreads();
            // Reduce to fit into smemPerDim (warp size)
#pragma unroll
            for (int redStride = atomDataSize / 2; redStride > minStride; redStride >>= 1)
            {
                if (splineIndex < redStride)
                {
                    sm_forceReduction[elementIndex] += sm_forceReduction[elementIndex + redStride];
                }
            }
            __syncthreads();
            // Last iteration - packing everything to be nearby, storing convenience pointer
            sm_forceTemp[dimIndex] = sm_forceReduction + dimIndex * smemPerDim;
            int redStride          = minStride;
            if (splineIndex < redStride)
            {
                const int packedIndex = atomIndexLocal * redStride + splineIndex;
                sm_forceTemp[dimIndex][packedIndex] =
                        sm_forceReduction[elementIndex] + sm_forceReduction[elementIndex + redStride];
            }
            __syncthreads();
        }

        assert((blockSize / warp_size) >= DIM);
        // assert (atomsPerBlock <= warp_size);

        const int warpIndex = lineIndex / warp_size;
        const int dimIndex  = warpIndex;

        // First 3 warps can now process 1 dimension each
        if (dimIndex < DIM)
        {
            int sourceIndex = lineIndex % warp_size;
#pragma unroll
            for (int redStride = minStride / 2; redStride > 1; redStride >>= 1)
            {
                if (!(splineIndex & redStride))
                {
                    sm_forceTemp[dimIndex][sourceIndex] += sm_forceTemp[dimIndex][sourceIndex + redStride];
                }
            }

            __syncwarp();

            const float n         = read_grid_size(realGridSizeFP, dimIndex);
            const int   atomIndex = sourceIndex / minStride;

            if (sourceIndex == minStride * atomIndex)
            {
                *((float*)(&sm_forces[atomIndex]) + dimIndex) =
                        (sm_forceTemp[dimIndex][sourceIndex] + sm_forceTemp[dimIndex][sourceIndex + 1]) * n;
            }
        }
    }
}
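/* Note: the power-of-two test above is on the interpolation order; since only order 4 is
 * currently supported (see the static_assert), pme_gather_kernel below always takes the
 * warp-shuffle branch, and the shared-memory branch remains as a fallback for hypothetical
 * non-power-of-two orders.
 */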

/*! \brief Calculate the sum of the force partial components (in X, Y and Z)
 *
 * \tparam[in] order              The PME order (must be 4).
 * \tparam[in] atomsPerWarp       The number of atoms per GPU warp.
 * \tparam[in] wrapX              Tells if the grid is wrapped in the X dimension.
 * \tparam[in] wrapY              Tells if the grid is wrapped in the Y dimension.
 * \param[out] fx                 The force partial component in the X dimension.
 * \param[out] fy                 The force partial component in the Y dimension.
 * \param[out] fz                 The force partial component in the Z dimension.
 * \param[in] ithyMin             The thread minimum index in the Y dimension.
 * \param[in] ithyMax             The thread maximum index in the Y dimension.
 * \param[in] ixBase              The grid line index base value in the X dimension.
 * \param[in] iz                  The grid line index in the Z dimension.
 * \param[in] nx                  The grid real size in the X dimension.
 * \param[in] ny                  The grid real size in the Y dimension.
 * \param[in] pny                 The padded grid real size in the Y dimension.
 * \param[in] pnz                 The padded grid real size in the Z dimension.
 * \param[in] atomIndexLocal      The atom index for this thread.
 * \param[in] splineIndexBase     The base value of the spline parameter index.
 * \param[in] tdz                 The theta and dtheta in the Z dimension.
 * \param[in] sm_gridlineIndices  Shared memory array of grid line indices.
 * \param[in] sm_theta            Shared memory array of atom theta values.
 * \param[in] sm_dtheta           Shared memory array of atom dtheta values.
 * \param[in] gm_grid             Global memory array of the grid to use.
 */
template<int order, int atomsPerWarp, bool wrapX, bool wrapY>
__device__ __forceinline__ void sumForceComponents(float* __restrict__ fx,
                                                   float* __restrict__ fy,
                                                   float* __restrict__ fz,
                                                   const int    ithyMin,
                                                   const int    ithyMax,
                                                   const int    ixBase,
                                                   const int    iz,
                                                   const int    nx,
                                                   const int    ny,
                                                   const int    pny,
                                                   const int    pnz,
                                                   const int    atomIndexLocal,
                                                   const int    splineIndexBase,
                                                   const float2 tdz,
                                                   const int* __restrict__ sm_gridlineIndices,
                                                   const float* __restrict__ sm_theta,
                                                   const float* __restrict__ sm_dtheta,
                                                   const float* __restrict__ gm_grid)
{
#pragma unroll
    for (int ithy = ithyMin; ithy < ithyMax; ithy++)
    {
        const int splineIndexY = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
        const float2 tdy       = make_float2(sm_theta[splineIndexY], sm_dtheta[splineIndexY]);

        int iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
        if (wrapY & (iy >= ny))
        {
            iy -= ny;
        }
        const int constOffset = iy * pnz + iz;

#pragma unroll
        for (int ithx = 0; (ithx < order); ithx++)
        {
            int ix = ixBase + ithx;
            if (wrapX & (ix >= nx))
            {
                ix -= nx;
            }
            const int gridIndexGlobal = ix * pny * pnz + constOffset;
            assert(gridIndexGlobal >= 0);
            const float gridValue = gm_grid[gridIndexGlobal];
            assert(isfinite(gridValue));
            const int splineIndexX = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
            const float2 tdx       = make_float2(sm_theta[splineIndexX], sm_dtheta[splineIndexX]);
            const float  fxy1      = tdz.x * gridValue;
            const float  fz1       = tdz.y * gridValue;
            *fx += tdx.y * tdy.x * fxy1;
            *fy += tdx.x * tdy.y * fxy1;
            *fz += tdx.x * tdy.x * fz1;
        }
    }
}
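/* For reference, each (ithx, ithy, ithz) term accumulated above contributes
 *     fx += dtheta_x * theta_y  * theta_z  * grid(ix, iy, iz)
 *     fy += theta_x  * dtheta_y * theta_z  * grid(ix, iy, iz)
 *     fz += theta_x  * theta_y  * dtheta_z * grid(ix, iy, iz)
 * i.e. the derivative of the B-spline interpolation of the grid potential along each
 * fractional grid dimension; scaling by the grid sizes and the reciprocal box is applied
 * later in reduce_atom_forces() and calculateAndStoreGridForces().
 */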

/*! \brief Calculate the grid forces and store them in shared memory.
 *
 * \param[in,out] sm_forces       Shared memory array with the output forces.
 * \param[in] forceIndexLocal     The local (per thread) index in the sm_forces array.
 * \param[in] forceIndexGlobal    The index of the thread in the gm_coefficients array.
 * \param[in] recipBox            The reciprocal box.
 * \param[in] scale               The scale to use when calculating the forces. For the B-state
 * coefficients (when using multiple coefficient sets on a single grid) the scale will be (1.0 - scale).
 * \param[in] gm_coefficients     Global memory array of the coefficients to use for an unperturbed
 * system or for FEP state A if a single grid is used (\p multiCoefficientsSingleGrid == true). If two
 * separate grids are used this should be the coefficients of the grid in question.
 */
__device__ __forceinline__ void calculateAndStoreGridForces(float3* __restrict__ sm_forces,
                                                            const int   forceIndexLocal,
                                                            const int   forceIndexGlobal,
                                                            const float recipBox[DIM][DIM],
                                                            const float scale,
                                                            const float* __restrict__ gm_coefficients)
{
    const float3 atomForces     = sm_forces[forceIndexLocal];
    float        negCoefficient = -scale * gm_coefficients[forceIndexGlobal];
    float3       result;
    result.x = negCoefficient * recipBox[XX][XX] * atomForces.x;
    result.y = negCoefficient * (recipBox[XX][YY] * atomForces.x + recipBox[YY][YY] * atomForces.y);
    result.z = negCoefficient
               * (recipBox[XX][ZZ] * atomForces.x + recipBox[YY][ZZ] * atomForces.y
                  + recipBox[ZZ][ZZ] * atomForces.z);
    sm_forces[forceIndexLocal] = result;
}
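/* Written as a sum (for reference), the transformation above is
 *     F_d = -q * scale * sum_{k <= d} f_k * recipBox[k][d],   with d, k in {XX, YY, ZZ},
 * i.e. the gradient accumulated in fractional grid coordinates is mapped to Cartesian forces
 * through the reciprocal box; the k > d terms are omitted because those entries are zero in
 * the reciprocal-box layout passed to this kernel.
 */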

/*! \brief
 * A CUDA kernel which gathers the atom forces from the grid.
 * The grid is assumed to be wrapped in dimension Z.
 *
 * \tparam[in] order                The PME order (must be 4 currently).
 * \tparam[in] wrapX                Tells if the grid is wrapped in the X dimension.
 * \tparam[in] wrapY                Tells if the grid is wrapped in the Y dimension.
 * \tparam[in] numGrids             The number of grids to use in the kernel. Can be 1 or 2.
 * \tparam[in] readGlobal           Tells if we should read spline values from global memory
 * \tparam[in] threadsPerAtom       How many threads work on each atom
 *
 * \param[in]  kernelParams         All the PME GPU data.
 */
template<int order, bool wrapX, bool wrapY, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom>
__launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__
        void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams)
{
    assert(numGrids == 1 || numGrids == 2);

    /* Global memory pointers */
    const float* __restrict__ gm_coefficientsA = kernelParams.atoms.d_coefficients[0];
    const float* __restrict__ gm_coefficientsB = kernelParams.atoms.d_coefficients[1];
    const float* __restrict__ gm_gridA         = kernelParams.grid.d_realGrid[0];
    const float* __restrict__ gm_gridB         = kernelParams.grid.d_realGrid[1];
    float* __restrict__ gm_forces = reinterpret_cast<float*>(kernelParams.atoms.d_forces);

    /* Global memory pointers for readGlobal */
    const float* __restrict__ gm_theta         = kernelParams.atoms.d_theta;
    const float* __restrict__ gm_dtheta        = kernelParams.atoms.d_dtheta;
    const int* __restrict__ gm_gridlineIndices = kernelParams.atoms.d_gridlineIndices;

    float3 atomX;
    float  atomCharge;

    const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;

    /* Number of data components and threads for a single atom */
    const int threadsPerAtomValue = (threadsPerAtom == ThreadsPerAtom::Order) ? order : order * order;
    const int atomDataSize        = threadsPerAtomValue;
    const int atomsPerBlock       = c_gatherMaxThreadsPerBlock / atomDataSize;
    // Number of atoms processed by a single warp in spread and gather
    const int atomsPerWarp = warp_size / atomDataSize;

    const int blockSize = atomsPerBlock * atomDataSize;
    assert(blockSize == blockDim.x * blockDim.y * blockDim.z);
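    /* For example (illustrative only): with order == 4 and ThreadsPerAtom::OrderSquared,
     * atomDataSize == 16, so atomsPerWarp == warp_size / 16 (2 on a 32-wide warp) and
     * atomsPerBlock == c_gatherMaxThreadsPerBlock / 16; with ThreadsPerAtom::Order,
     * atomDataSize == 4 and four times as many atoms fit per warp and per block.
     */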

    /* These are the atom indices - for the shared and global memory */
    const int atomIndexLocal  = threadIdx.z;
    const int atomIndexOffset = blockIndex * atomsPerBlock;
    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;

    /* Early return for fully empty blocks at the end
     * (should only happen for billions of input atoms)
     */
    if (atomIndexOffset >= kernelParams.atoms.nAtoms)
    {
        return;
    }
    // Shared-memory sizes below: spline parameters take atomsPerBlock * DIM (3) * order (4) floats,
    // gridline indices take atomsPerBlock * DIM ints (e.g. 4 warps per block, 8 atoms per warp)
    const int        splineParamsSize    = atomsPerBlock * DIM * order;
    const int        gridlineIndicesSize = atomsPerBlock * DIM;
    __shared__ int   sm_gridlineIndices[gridlineIndicesSize];
    __shared__ float sm_theta[splineParamsSize];
    __shared__ float sm_dtheta[splineParamsSize];

    /* Spline Z coordinates */
    const int ithz = threadIdx.x;

    /* These are the spline contribution indices in shared memory */
    const int splineIndex = threadIdx.y * blockDim.x + threadIdx.x;
    const int lineIndex   = (threadIdx.z * (blockDim.x * blockDim.y))
                          + splineIndex; /* And to all the block's particles */

    const int threadLocalId =
            (threadIdx.z * (blockDim.x * blockDim.y)) + blockDim.x * threadIdx.y + threadIdx.x;
    const int threadLocalIdMax = blockDim.x * blockDim.y * blockDim.z;
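    /* The indexing above implies a launch geometry of
     * blockDim == (order, threadsPerAtom == ThreadsPerAtom::Order ? 1 : order, atomsPerBlock):
     * threadIdx.x walks the Z spline contributions, threadIdx.y the Y ones (or collapses to 1
     * when each thread loops over Y itself), and threadIdx.z selects the atom within the block.
     */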

    if (readGlobal)
    {
        /* Read splines */
        const int localGridlineIndicesIndex = threadLocalId;
        const int globalGridlineIndicesIndex = blockIndex * gridlineIndicesSize + localGridlineIndicesIndex;
        if (localGridlineIndicesIndex < gridlineIndicesSize)
        {
            sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
            assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
        }
        /* The loop is needed with order threads per atom to make sure we load all spline values:
         * the block then has only atomsPerBlock * order threads for atomsPerBlock * DIM * order
         * values, so each thread must load several of them. With order*order threads per atom
         * each thread only needs to load one value, and a single iteration suffices. */

        const int iMin = 0;
        const int iMax = (threadsPerAtom == ThreadsPerAtom::Order) ? 3 : 1;

        for (int i = iMin; i < iMax; i++)
        {
            int localSplineParamsIndex =
                    threadLocalId
                    + i * threadLocalIdMax; /* i will always be zero for order*order threads per atom */
            int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex;
            if (localSplineParamsIndex < splineParamsSize)
            {
                sm_theta[localSplineParamsIndex]  = gm_theta[globalSplineParamsIndex];
                sm_dtheta[localSplineParamsIndex] = gm_dtheta[globalSplineParamsIndex];
                assert(isfinite(sm_theta[localSplineParamsIndex]));
                assert(isfinite(sm_dtheta[localSplineParamsIndex]));
            }
        }
        __syncthreads();
    }
    else
    {
        const float3* __restrict__ gm_coordinates = asFloat3(kernelParams.atoms.d_coordinates);
        /* Recalculate splines */
        if (c_useAtomDataPrefetch)
        {
            // charges
            __shared__ float sm_coefficients[atomsPerBlock];
            // Coordinates
            __shared__ float3 sm_coordinates[atomsPerBlock];
            /* Staging coefficients/charges */
            pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients, gm_coefficientsA);

            /* Staging coordinates */
            pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(sm_coordinates, gm_coordinates);
            __syncthreads();
            atomX      = sm_coordinates[atomIndexLocal];
            atomCharge = sm_coefficients[atomIndexLocal];
        }
        else
        {
            atomX      = gm_coordinates[atomIndexGlobal];
            atomCharge = gm_coefficientsA[atomIndexGlobal];
        }
        calculate_splines<order, atomsPerBlock, atomsPerWarp, true, false>(
                kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, sm_dtheta, sm_gridlineIndices);
        __syncwarp();
    }
    float fx = 0.0f;
    float fy = 0.0f;
    float fz = 0.0f;

    const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsA[atomIndexGlobal]);

    const int nx  = kernelParams.grid.realGridSize[XX];
    const int ny  = kernelParams.grid.realGridSize[YY];
    const int nz  = kernelParams.grid.realGridSize[ZZ];
    const int pny = kernelParams.grid.realGridSizePadded[YY];
    const int pnz = kernelParams.grid.realGridSizePadded[ZZ];

    const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
    const int warpIndex     = atomIndexLocal / atomsPerWarp;

    const int splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
    const int    splineIndexZ = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
    const float2 tdz          = make_float2(sm_theta[splineIndexZ], sm_dtheta[splineIndexZ]);

    int       iz     = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
    const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];

    if (iz >= nz)
    {
        iz -= nz;
    }

    const int ithyMin = (threadsPerAtom == ThreadsPerAtom::Order) ? 0 : threadIdx.y;
    const int ithyMax = (threadsPerAtom == ThreadsPerAtom::Order) ? order : threadIdx.y + 1;
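    /* With ThreadsPerAtom::Order each thread loops over all order Y contributions itself;
     * with ThreadsPerAtom::OrderSquared each thread handles the single ithy == threadIdx.y. */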
    if (chargeCheck)
    {
        sumForceComponents<order, atomsPerWarp, wrapX, wrapY>(&fx,
                                                              &fy,
                                                              &fz,
                                                              ithyMin,
                                                              ithyMax,
                                                              ixBase,
                                                              iz,
                                                              nx,
                                                              ny,
                                                              pny,
                                                              pnz,
                                                              atomIndexLocal,
                                                              splineIndexBase,
                                                              tdz,
                                                              sm_gridlineIndices,
                                                              sm_theta,
                                                              sm_dtheta,
                                                              gm_gridA);
    }
    // Reduction of partial force contributions
    __shared__ float3 sm_forces[atomsPerBlock];
    reduce_atom_forces<order, atomDataSize, blockSize>(
            sm_forces, atomIndexLocal, splineIndex, lineIndex, kernelParams.grid.realGridSizeFP, fx, fy, fz);
    __syncthreads();

    /* Calculating the final forces with no component branching, atomsPerBlock threads */
    const int   forceIndexLocal  = threadLocalId;
    const int   forceIndexGlobal = atomIndexOffset + forceIndexLocal;
    const float scale            = kernelParams.current.scale;
    if (forceIndexLocal < atomsPerBlock)
    {
        calculateAndStoreGridForces(
                sm_forces, forceIndexLocal, forceIndexGlobal, kernelParams.current.recipBox, scale, gm_coefficientsA);
    }

    __syncwarp();
    assert(atomsPerBlock <= warp_size);

    /* Writing or adding the final forces component-wise, single warp */
    const int blockForcesSize = atomsPerBlock * DIM;
    const int numIter         = (blockForcesSize + warp_size - 1) / warp_size;
    const int iterThreads     = blockForcesSize / numIter;
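    /* Illustrative numbers only: atomsPerBlock == 8 gives blockForcesSize == 24,
     * numIter == 1 and iterThreads == 24, so a single pass of one warp writes all components;
     * larger atomsPerBlock values add iterations of the loop below. */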
    if (threadLocalId < iterThreads)
    {
#pragma unroll
        for (int i = 0; i < numIter; i++)
        {
            int   outputIndexLocal       = i * iterThreads + threadLocalId;
            int   outputIndexGlobal      = blockIndex * blockForcesSize + outputIndexLocal;
            float outputForceComponent   = ((float*)sm_forces)[outputIndexLocal];
            gm_forces[outputIndexGlobal] = outputForceComponent;
        }
    }

    if (numGrids == 2)
    {
        /* We must sync here since the same shared memory is used as above. */
        __syncthreads();
        fx                    = 0.0f;
        fy                    = 0.0f;
        fz                    = 0.0f;
        const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsB[atomIndexGlobal]);
        if (chargeCheck)
        {
            sumForceComponents<order, atomsPerWarp, wrapX, wrapY>(&fx,
                                                                  &fy,
                                                                  &fz,
                                                                  ithyMin,
                                                                  ithyMax,
                                                                  ixBase,
                                                                  iz,
                                                                  nx,
                                                                  ny,
                                                                  pny,
                                                                  pnz,
                                                                  atomIndexLocal,
                                                                  splineIndexBase,
                                                                  tdz,
                                                                  sm_gridlineIndices,
                                                                  sm_theta,
                                                                  sm_dtheta,
                                                                  gm_gridB);
        }
        // Reduction of partial force contributions
        reduce_atom_forces<order, atomDataSize, blockSize>(
                sm_forces, atomIndexLocal, splineIndex, lineIndex, kernelParams.grid.realGridSizeFP, fx, fy, fz);
        __syncthreads();

        /* Calculating the final forces with no component branching, atomsPerBlock threads */
        if (forceIndexLocal < atomsPerBlock)
        {
            calculateAndStoreGridForces(sm_forces,
                                        forceIndexLocal,
                                        forceIndexGlobal,
                                        kernelParams.current.recipBox,
                                        1.0F - scale,
                                        gm_coefficientsB);
        }

        __syncwarp();

        /* Writing or adding the final forces component-wise, single warp */
        if (threadLocalId < iterThreads)
        {
#pragma unroll
            for (int i = 0; i < numIter; i++)
            {
                int   outputIndexLocal     = i * iterThreads + threadLocalId;
                int   outputIndexGlobal    = blockIndex * blockForcesSize + outputIndexLocal;
                float outputForceComponent = ((float*)sm_forces)[outputIndexLocal];
                gm_forces[outputIndexGlobal] += outputForceComponent;
            }
        }
    }
}

//! Kernel instantiations
// clang-format off
template __global__ void pme_gather_kernel<4, true, true, 1, true, ThreadsPerAtom::Order>        (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 1, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 1, false, ThreadsPerAtom::Order>       (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 1, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 2, true, ThreadsPerAtom::Order>        (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 2, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 2, false, ThreadsPerAtom::Order>       (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 2, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
// clang-format on
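
/* A minimal launch sketch, for orientation only (gridDimensions/blockDimensions/stream are
 * hypothetical names; the real launch is done elsewhere in the PME GPU module through the
 * GROMACS GPU launch wrappers):
 *
 *     // dim3 blockDimensions(order, order, atomsPerBlock);   // OrderSquared thread layout
 *     // dim3 gridDimensions(blockCountX, blockCountY, 1);
 *     // pme_gather_kernel<4, true, true, 1, true, ThreadsPerAtom::OrderSquared>
 *     //         <<<gridDimensions, blockDimensions, 0, stream>>>(kernelParams);
 */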