CUDA PME gather kernel
src/gromacs/ewald/pme_gather.cu
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2016,2017,2018,2019,2020 by the GROMACS development team.
 * Copyright (c) 2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/*! \internal \file
 *  \brief Implements PME force gathering in CUDA.
 *
 *  \author Aleksei Iupinov <a.yupinov@gmail.com>
 */

#include "gmxpre.h"

#include <cassert>

#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
#include "gromacs/gpu_utils/typecasts.cuh"

#include "pme.cuh"
#include "pme_gpu_calculate_splines.cuh"
#include "pme_grid.h"

/*! \brief
 * An inline CUDA function: unrolls the dynamic index accesses to the constant grid sizes to avoid local memory operations.
 */
__device__ __forceinline__ float read_grid_size(const float* realGridSizeFP, const int dimIndex)
{
    switch (dimIndex)
    {
        case XX: return realGridSizeFP[XX];
        case YY: return realGridSizeFP[YY];
        case ZZ: return realGridSizeFP[ZZ];
    }
    assert(false);
    return 0.0F;
}
68
69 /*! \brief Reduce the partial force contributions.
70  *
71  * \tparam     order              The PME order (must be 4).
72  * \tparam     atomDataSize       The number of partial force contributions for each atom (currently
73  *                                order^2 == 16)
74  * \tparam     blockSize          The CUDA block size
75  *
76  * \param[out] sm_forces          Shared memory array with the output forces (number of elements
77  *                                is number of atoms per block)
78  * \param[in]  atomIndexLocal     Local atom index
79  * \param[in]  splineIndex        Spline index
80  * \param[in]  lineIndex          Line index (same as threadLocalId)
81  * \param[in]  realGridSizeFP     Local grid size constant
82  * \param[in]  fx                 Input force partial component X
83  * \param[in]  fy                 Input force partial component Y
84  * \param[in]  fz                 Input force partial component Z
85  */
86 template<int order, int atomDataSize, int blockSize>
87 __device__ __forceinline__ void reduce_atom_forces(float3* __restrict__ sm_forces,
88                                                    const int    atomIndexLocal,
89                                                    const int    splineIndex,
90                                                    const int    lineIndex,
91                                                    const float* realGridSizeFP,
92                                                    float& fx, // NOLINT(google-runtime-references)
93                                                    float& fy, // NOLINT(google-runtime-references)
94                                                    float& fz) // NOLINT(google-runtime-references)
95 {
    if (gmx::isPowerOfTwo(order)) // Only for orders that are powers of 2
    {
        const unsigned int activeMask = c_fullWarpMask;

        // A tricky shuffle reduction inspired by reduce_force_j_warp_shfl
        // TODO: find out if this is the best in terms of transactions count
        static_assert(order == 4, "Only order of 4 is implemented");
        static_assert(atomDataSize <= warp_size,
                      "TODO: rework for atomDataSize > warp_size (order 8 or larger)");
        const int width = atomDataSize;

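        // Reduce all three components with one chain of shuffles: fx is reduced downwards and fy
        // upwards, so after the first exchange even lanes hold x pair sums and odd lanes hold y
        // pair sums; the second exchange folds the z sums in the same way (see the summary below).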
        fx += __shfl_down_sync(activeMask, fx, 1, width);
        fy += __shfl_up_sync(activeMask, fy, 1, width);
        fz += __shfl_down_sync(activeMask, fz, 1, width);

        if (splineIndex & 1)
        {
            fx = fy;
        }

        fx += __shfl_down_sync(activeMask, fx, 2, width);
        fz += __shfl_up_sync(activeMask, fz, 2, width);

        if (splineIndex & 2)
        {
            fx = fz;
        }

        // By now fx contains intermediate quad sums of all 3 components:
        // splineIndex    0            1            2 and 3      4            5            6 and 7 8...
        // sum of...      fx0 to fx3   fy0 to fy3   fz0 to fz3   fx4 to fx7   fy4 to fy7   fz4 to fz7 etc.

        // Now we just need to further reduce those groups of 4
        for (int delta = 4; delta < atomDataSize; delta <<= 1)
        {
            fx += __shfl_down_sync(activeMask, fx, delta, width);
        }

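        // After the loop, lanes 0, 1 and 2 of each atom's group hold the full X, Y and Z sums;
        // scale them by the grid size and write them to the per-atom shared-memory force.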
        const int dimIndex = splineIndex;
        if (dimIndex < DIM)
        {
            const float n = read_grid_size(realGridSizeFP, dimIndex);
            float* __restrict__ sm_forcesAtomIndexOffset =
                    reinterpret_cast<float*>(&sm_forces[atomIndexLocal]);
            sm_forcesAtomIndexOffset[dimIndex] = fx * n;
        }
    }
    else
    {
        // We use blockSize shared memory elements to read fx, or fy, or fz, and then reduce them to
        // fit into smemPerDim elements which are stored separately (first 2 dimensions only)
        const int         smemPerDim   = warp_size;
        const int         smemReserved = (DIM)*smemPerDim;
        __shared__ float  sm_forceReduction[smemReserved + blockSize];
        __shared__ float* sm_forceTemp[DIM];

        const int numWarps = blockSize / smemPerDim;
        const int minStride =
                max(1, atomDataSize / numWarps); // order 4: 128 threads => 4, 256 threads => 2, etc

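        // Stage one force component per iteration: every thread writes its partial into shared
        // memory and a tree reduction shrinks the data until minStride values per atom remain.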
#pragma unroll
        for (int dimIndex = 0; dimIndex < DIM; dimIndex++)
        {
            int elementIndex = smemReserved + lineIndex;
            // Store input force contributions
            sm_forceReduction[elementIndex] = (dimIndex == XX) ? fx : (dimIndex == YY) ? fy : fz;
            // sync here because two warps write data that the first one consumes below
            __syncthreads();
            // Reduce to fit into smemPerDim (warp size)
#pragma unroll
            for (int redStride = atomDataSize / 2; redStride > minStride; redStride >>= 1)
            {
                if (splineIndex < redStride)
                {
                    sm_forceReduction[elementIndex] += sm_forceReduction[elementIndex + redStride];
                }
            }
            __syncthreads();
            // Last iteration: pack everything close together and store a convenience pointer
            sm_forceTemp[dimIndex] = sm_forceReduction + dimIndex * smemPerDim;
            int redStride          = minStride;
            if (splineIndex < redStride)
            {
                const int packedIndex = atomIndexLocal * redStride + splineIndex;
                sm_forceTemp[dimIndex][packedIndex] =
                        sm_forceReduction[elementIndex] + sm_forceReduction[elementIndex + redStride];
            }
            __syncthreads();
        }

        assert((blockSize / warp_size) >= DIM);
        // assert (atomsPerBlock <= warp_size);

        const int warpIndex = lineIndex / warp_size;
        const int dimIndex  = warpIndex;

        // First 3 warps can now process 1 dimension each
        if (dimIndex < DIM)
        {
            int sourceIndex = lineIndex % warp_size;
#pragma unroll
            for (int redStride = minStride / 2; redStride > 1; redStride >>= 1)
            {
                if (!(splineIndex & redStride))
                {
                    sm_forceTemp[dimIndex][sourceIndex] += sm_forceTemp[dimIndex][sourceIndex + redStride];
                }
            }

            __syncwarp();

            const float n         = read_grid_size(realGridSizeFP, dimIndex);
            const int   atomIndex = sourceIndex / minStride;

            if (sourceIndex == minStride * atomIndex)
            {
                float* __restrict__ sm_forcesAtomIndexOffset =
                        reinterpret_cast<float*>(&sm_forces[atomIndex]);
                sm_forcesAtomIndexOffset[dimIndex] =
                        (sm_forceTemp[dimIndex][sourceIndex] + sm_forceTemp[dimIndex][sourceIndex + 1]) * n;
            }
        }
    }
}

/*! \brief Calculate the sum of the force partial components (in X, Y and Z)
 *
 * \tparam     order              The PME order (must be 4).
 * \tparam     atomsPerWarp       The number of atoms per GPU warp.
 * \tparam     wrapX              Tells if the grid is wrapped in the X dimension.
 * \tparam     wrapY              Tells if the grid is wrapped in the Y dimension.
 * \param[out] fx                 The force partial component in the X dimension.
 * \param[out] fy                 The force partial component in the Y dimension.
 * \param[out] fz                 The force partial component in the Z dimension.
 * \param[in]  ithyMin            The thread minimum index in the Y dimension.
 * \param[in]  ithyMax            The thread maximum index in the Y dimension.
 * \param[in]  ixBase             The grid line index base value in the X dimension.
 * \param[in]  iz                 The grid line index in the Z dimension.
 * \param[in]  nx                 The grid real size in the X dimension.
 * \param[in]  ny                 The grid real size in the Y dimension.
 * \param[in]  pny                The padded grid real size in the Y dimension.
 * \param[in]  pnz                The padded grid real size in the Z dimension.
 * \param[in]  atomIndexLocal     The atom index for this thread.
 * \param[in]  splineIndexBase    The base value of the spline parameter index.
 * \param[in]  tdz                The theta and dtheta in the Z dimension.
 * \param[in]  sm_gridlineIndices Shared memory array of grid line indices.
 * \param[in]  sm_theta           Shared memory array of atom theta values.
 * \param[in]  sm_dtheta          Shared memory array of atom dtheta values.
 * \param[in]  gm_grid            Global memory array of the grid to use.
 */
template<int order, int atomsPerWarp, bool wrapX, bool wrapY>
__device__ __forceinline__ void sumForceComponents(float* __restrict__ fx,
                                                   float* __restrict__ fy,
                                                   float* __restrict__ fz,
                                                   const int    ithyMin,
                                                   const int    ithyMax,
                                                   const int    ixBase,
                                                   const int    iz,
                                                   const int    nx,
                                                   const int    ny,
                                                   const int    pny,
                                                   const int    pnz,
                                                   const int    atomIndexLocal,
                                                   const int    splineIndexBase,
                                                   const float2 tdz,
                                                   const int* __restrict__ sm_gridlineIndices,
                                                   const float* __restrict__ sm_theta,
                                                   const float* __restrict__ sm_dtheta,
                                                   const float* __restrict__ gm_grid)
{
#pragma unroll
    for (int ithy = ithyMin; ithy < ithyMax; ithy++)
    {
        const int splineIndexY = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
        const float2 tdy       = make_float2(sm_theta[splineIndexY], sm_dtheta[splineIndexY]);

        int iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
        if (wrapY & (iy >= ny))
        {
            iy -= ny;
        }
        const int constOffset = iy * pnz + iz;

#pragma unroll
        for (int ithx = 0; (ithx < order); ithx++)
        {
            int ix = ixBase + ithx;
            if (wrapX & (ix >= nx))
            {
                ix -= nx;
            }
            const int gridIndexGlobal = ix * pny * pnz + constOffset;
            assert(gridIndexGlobal >= 0);
            const float gridValue = gm_grid[gridIndexGlobal];
            assert(isfinite(gridValue));
            const int splineIndexX = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
            const float2 tdx       = make_float2(sm_theta[splineIndexX], sm_dtheta[splineIndexX]);
            const float  fxy1      = tdz.x * gridValue;
            const float  fz1       = tdz.y * gridValue;
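            // Each td pair stores (theta, dtheta): the force component along a dimension uses the
            // spline derivative (.y) in that dimension and the spline weights (.x) in the other two.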
            *fx += tdx.y * tdy.x * fxy1;
            *fy += tdx.x * tdy.y * fxy1;
            *fz += tdx.x * tdy.x * fz1;
        }
    }
}

/*! \brief Calculate the grid forces and store them in shared memory.
 *
 * \param[in,out] sm_forces       Shared memory array with the output forces.
 * \param[in] forceIndexLocal     The local (per thread) index in the sm_forces array.
 * \param[in] forceIndexGlobal    The index of the thread in the gm_coefficients array.
 * \param[in] recipBox            The reciprocal box.
 * \param[in] scale               The scale to use when calculating the forces. For gm_coefficientsB
 * (when using multiple coefficients on a single grid) the scale will be (1.0 - scale).
 * \param[in] gm_coefficients     Global memory array of the coefficients to use for an unperturbed
 * system or for FEP in state A if a single grid is used (\p multiCoefficientsSingleGrid == true). If
 * two separate grids are used, this should be the coefficients of the grid in question.
 */
__device__ __forceinline__ void calculateAndStoreGridForces(float3* __restrict__ sm_forces,
                                                             const int   forceIndexLocal,
                                                             const int   forceIndexGlobal,
                                                             const float recipBox[DIM][DIM],
                                                             const float scale,
                                                             const float* __restrict__ gm_coefficients)
{
    const float3 atomForces     = sm_forces[forceIndexLocal];
    float        negCoefficient = -scale * gm_coefficients[forceIndexGlobal];
    float3       result;
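    // Convert the gathered force from grid (fractional) coordinates to Cartesian ones and apply
    // the negated, scaled charge; only the triangular part of the reciprocal box contributes.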
    result.x = negCoefficient * recipBox[XX][XX] * atomForces.x;
    result.y = negCoefficient * (recipBox[XX][YY] * atomForces.x + recipBox[YY][YY] * atomForces.y);
    result.z = negCoefficient
               * (recipBox[XX][ZZ] * atomForces.x + recipBox[YY][ZZ] * atomForces.y
                  + recipBox[ZZ][ZZ] * atomForces.z);
    sm_forces[forceIndexLocal] = result;
}

/*! \brief
 * A CUDA kernel which gathers the atom forces from the grid.
 * The grid is assumed to be wrapped in dimension Z.
 *
 * \tparam     order                The PME order (must be 4 currently).
 * \tparam     wrapX                Tells if the grid is wrapped in the X dimension.
 * \tparam     wrapY                Tells if the grid is wrapped in the Y dimension.
 * \tparam     numGrids             The number of grids to use in the kernel. Can be 1 or 2.
 * \tparam     readGlobal           Tells if we should read spline values from global memory.
 * \tparam     threadsPerAtom       How many threads work on each atom.
 *
 * \param[in]  kernelParams         All the PME GPU data.
 */
template<int order, bool wrapX, bool wrapY, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom>
__launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__
        void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams)
{
    assert(numGrids == 1 || numGrids == 2);

    /* Global memory pointers */
    const float* __restrict__ gm_coefficientsA = kernelParams.atoms.d_coefficients[0];
    const float* __restrict__ gm_coefficientsB = kernelParams.atoms.d_coefficients[1];
    const float* __restrict__ gm_gridA         = kernelParams.grid.d_realGrid[0];
    const float* __restrict__ gm_gridB         = kernelParams.grid.d_realGrid[1];
    static_assert(sizeof(*kernelParams.atoms.d_forces) == 3 * sizeof(float));
    float* __restrict__ gm_forces = reinterpret_cast<float*>(kernelParams.atoms.d_forces);

    /* Global memory pointers for readGlobal */
    const float* __restrict__ gm_theta         = kernelParams.atoms.d_theta;
    const float* __restrict__ gm_dtheta        = kernelParams.atoms.d_dtheta;
    const int* __restrict__ gm_gridlineIndices = kernelParams.atoms.d_gridlineIndices;

    float3 atomX;
    float  atomCharge;

    const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;

    /* Number of data components and threads for a single atom */
    const int threadsPerAtomValue = (threadsPerAtom == ThreadsPerAtom::Order) ? order : order * order;
    const int atomDataSize        = threadsPerAtomValue;
    const int atomsPerBlock       = c_gatherMaxThreadsPerBlock / atomDataSize;
    // Number of atoms processed by a single warp in spread and gather
    const int atomsPerWarp = warp_size / atomDataSize;

    const int blockSize = atomsPerBlock * atomDataSize;
    assert(blockSize == blockDim.x * blockDim.y * blockDim.z);
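    // The block is laid out so that blockDim.x (and blockDim.y for ThreadsPerAtom::OrderSquared)
    // cover the per-atom spline contributions, while blockDim.z indexes the atoms of the block.
    // With order 4 and 32-thread warps this gives 8 atoms per warp (Order) or 2 (OrderSquared).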

    /* These are the atom indices - for the shared and global memory */
    const int atomIndexLocal  = threadIdx.z;
    const int atomIndexOffset = blockIndex * atomsPerBlock;
    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;

    /* Early return for fully empty blocks at the end
     * (should only happen for billions of input atoms)
     */
    if (atomIndexOffset >= kernelParams.atoms.nAtoms)
    {
        return;
    }
    // 4 warps per block, 8 atoms per warp *3 *4
    const int        splineParamsSize    = atomsPerBlock * DIM * order;
    const int        gridlineIndicesSize = atomsPerBlock * DIM;
    __shared__ int   sm_gridlineIndices[gridlineIndicesSize];
    __shared__ float sm_theta[splineParamsSize];
    __shared__ float sm_dtheta[splineParamsSize];

    /* Spline Z coordinates */
    const int ithz = threadIdx.x;

    /* These are the spline contribution indices in shared memory */
    const int splineIndex = threadIdx.y * blockDim.x + threadIdx.x;
    const int lineIndex   = (threadIdx.z * (blockDim.x * blockDim.y))
                          + splineIndex; /* And to all the block's particles */

    const int threadLocalId =
            (threadIdx.z * (blockDim.x * blockDim.y)) + blockDim.x * threadIdx.y + threadIdx.x;
    const int threadLocalIdMax = blockDim.x * blockDim.y * blockDim.z;

    if (readGlobal)
    {
        /* Read splines */
        const int localGridlineIndicesIndex = threadLocalId;
        const int globalGridlineIndicesIndex = blockIndex * gridlineIndicesSize + localGridlineIndicesIndex;
        if (localGridlineIndicesIndex < gridlineIndicesSize)
        {
            sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
            assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
        }
        /* The loop is needed with order threads per atom, where each thread must load multiple data
           values; with order*order threads per atom each thread only needs to load one value. */

        const int iMin = 0;
        const int iMax = (threadsPerAtom == ThreadsPerAtom::Order) ? 3 : 1;

        for (int i = iMin; i < iMax; i++)
        {
            int localSplineParamsIndex =
                    threadLocalId
                    + i * threadLocalIdMax; /* i will always be zero for order*order threads per atom */
            int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex;
            if (localSplineParamsIndex < splineParamsSize)
            {
                sm_theta[localSplineParamsIndex]  = gm_theta[globalSplineParamsIndex];
                sm_dtheta[localSplineParamsIndex] = gm_dtheta[globalSplineParamsIndex];
                assert(isfinite(sm_theta[localSplineParamsIndex]));
                assert(isfinite(sm_dtheta[localSplineParamsIndex]));
            }
        }
        __syncthreads();
    }
    else
    {
        const float3* __restrict__ gm_coordinates = asFloat3(kernelParams.atoms.d_coordinates);
        /* Recalculate splines */
        if (c_useAtomDataPrefetch)
        {
            // charges
            __shared__ float sm_coefficients[atomsPerBlock];
            // Coordinates
            __shared__ float3 sm_coordinates[atomsPerBlock];
            /* Staging coefficients/charges */
            pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients, gm_coefficientsA);

            /* Staging coordinates */
            pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(sm_coordinates, gm_coordinates);
            __syncthreads();
            atomX      = sm_coordinates[atomIndexLocal];
            atomCharge = sm_coefficients[atomIndexLocal];
        }
        else
        {
            atomX      = gm_coordinates[atomIndexGlobal];
            atomCharge = gm_coefficientsA[atomIndexGlobal];
        }
        calculate_splines<order, atomsPerBlock, atomsPerWarp, true, false, numGrids>(
                kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, sm_dtheta, sm_gridlineIndices);
        __syncwarp();
    }
    float fx = 0.0F;
    float fy = 0.0F;
    float fz = 0.0F;

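    // Zero-charge atoms skip the gather loop; their partial forces stay zero, and the force
    // reduction below is still executed by all threads.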
    const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsA[atomIndexGlobal]);

    const int nx  = kernelParams.grid.realGridSize[XX];
    const int ny  = kernelParams.grid.realGridSize[YY];
    const int nz  = kernelParams.grid.realGridSize[ZZ];
    const int pny = kernelParams.grid.realGridSizePadded[YY];
    const int pnz = kernelParams.grid.realGridSizePadded[ZZ];

    const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
    const int warpIndex     = atomIndexLocal / atomsPerWarp;

    const int splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
    const int    splineIndexZ = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
    const float2 tdz          = make_float2(sm_theta[splineIndexZ], sm_dtheta[splineIndexZ]);

    int       iz     = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
    const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];

    if (iz >= nz)
    {
        iz -= nz;
    }

    const int ithyMin = (threadsPerAtom == ThreadsPerAtom::Order) ? 0 : threadIdx.y;
    const int ithyMax = (threadsPerAtom == ThreadsPerAtom::Order) ? order : threadIdx.y + 1;
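    // With ThreadsPerAtom::Order each thread loops over all order contributions in Y;
    // with ThreadsPerAtom::OrderSquared threadIdx.y selects a single Y contribution.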
    if (chargeCheck)
    {
        sumForceComponents<order, atomsPerWarp, wrapX, wrapY>(&fx,
                                                              &fy,
                                                              &fz,
                                                              ithyMin,
                                                              ithyMax,
                                                              ixBase,
                                                              iz,
                                                              nx,
                                                              ny,
                                                              pny,
                                                              pnz,
                                                              atomIndexLocal,
                                                              splineIndexBase,
                                                              tdz,
                                                              sm_gridlineIndices,
                                                              sm_theta,
                                                              sm_dtheta,
                                                              gm_gridA);
    }
    // Reduction of partial force contributions
    __shared__ float3 sm_forces[atomsPerBlock];
    reduce_atom_forces<order, atomDataSize, blockSize>(
            sm_forces, atomIndexLocal, splineIndex, lineIndex, kernelParams.grid.realGridSizeFP, fx, fy, fz);
    __syncthreads();

    /* Calculating the final forces with no component branching, atomsPerBlock threads */
    const int   forceIndexLocal  = threadLocalId;
    const int   forceIndexGlobal = atomIndexOffset + forceIndexLocal;
    const float scale            = kernelParams.current.scale;
    if (forceIndexLocal < atomsPerBlock)
    {
        calculateAndStoreGridForces(
                sm_forces, forceIndexLocal, forceIndexGlobal, kernelParams.current.recipBox, scale, gm_coefficientsA);
    }

    __syncwarp();
    assert(atomsPerBlock <= warp_size);

    /* Writing or adding the final forces component-wise, single warp */
    const int blockForcesSize = atomsPerBlock * DIM;
    const int numIter         = (blockForcesSize + warp_size - 1) / warp_size;
    const int iterThreads     = blockForcesSize / numIter;
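    // Each of the iterThreads lowest threads writes numIter force components, spaced iterThreads
    // apart, so a single warp covers all atomsPerBlock * DIM output components.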
    if (threadLocalId < iterThreads)
    {
#pragma unroll
        for (int i = 0; i < numIter; i++)
        {
            int   outputIndexLocal       = i * iterThreads + threadLocalId;
            int   outputIndexGlobal      = blockIndex * blockForcesSize + outputIndexLocal;
            float outputForceComponent   = (reinterpret_cast<float*>(sm_forces)[outputIndexLocal]);
            gm_forces[outputIndexGlobal] = outputForceComponent;
        }
    }

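    // For FEP with two separate grids, repeat the gather for the B-state grid and accumulate the
    // result into the same output forces, weighted by (1 - scale).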
    if (numGrids == 2)
    {
        /* We must sync here since the same shared memory is used as above. */
        __syncthreads();
        fx                    = 0.0F;
        fy                    = 0.0F;
        fz                    = 0.0F;
        const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsB[atomIndexGlobal]);
        if (chargeCheck)
        {
            sumForceComponents<order, atomsPerWarp, wrapX, wrapY>(&fx,
                                                                  &fy,
                                                                  &fz,
                                                                  ithyMin,
                                                                  ithyMax,
                                                                  ixBase,
                                                                  iz,
                                                                  nx,
                                                                  ny,
                                                                  pny,
                                                                  pnz,
                                                                  atomIndexLocal,
                                                                  splineIndexBase,
                                                                  tdz,
                                                                  sm_gridlineIndices,
                                                                  sm_theta,
                                                                  sm_dtheta,
                                                                  gm_gridB);
        }
        // Reduction of partial force contributions
        reduce_atom_forces<order, atomDataSize, blockSize>(
                sm_forces, atomIndexLocal, splineIndex, lineIndex, kernelParams.grid.realGridSizeFP, fx, fy, fz);
        __syncthreads();

        /* Calculating the final forces with no component branching, atomsPerBlock threads */
        if (forceIndexLocal < atomsPerBlock)
        {
            calculateAndStoreGridForces(sm_forces,
                                        forceIndexLocal,
                                        forceIndexGlobal,
                                        kernelParams.current.recipBox,
                                        1.0F - scale,
                                        gm_coefficientsB);
        }

        __syncwarp();

        /* Writing or adding the final forces component-wise, single warp */
        if (threadLocalId < iterThreads)
        {
#pragma unroll
            for (int i = 0; i < numIter; i++)
            {
                int   outputIndexLocal     = i * iterThreads + threadLocalId;
                int   outputIndexGlobal    = blockIndex * blockForcesSize + outputIndexLocal;
                float outputForceComponent = (reinterpret_cast<float*>(sm_forces)[outputIndexLocal]);
                gm_forces[outputIndexGlobal] += outputForceComponent;
            }
        }
    }
}

//! Kernel instantiations
// clang-format off
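// Template arguments: <order, wrapX, wrapY, numGrids, readGlobal, threadsPerAtom>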
template __global__ void pme_gather_kernel<4, true, true, 1, true, ThreadsPerAtom::Order>        (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 1, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 1, false, ThreadsPerAtom::Order>       (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 1, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 2, true, ThreadsPerAtom::Order>        (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 2, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 2, false, ThreadsPerAtom::Order>       (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 2, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
// clang-format on