/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2016,2017,2018,2019,2020 by the GROMACS development team.
 * Copyright (c) 2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief Implements PME force gathering in CUDA.
 *
 * \author Aleksei Iupinov <a.yupinov@gmail.com>
 */
#include "gmxpre.h"

#include <cassert>

#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
#include "gromacs/gpu_utils/typecasts.cuh"

#include "pme.cuh"
#include "pme_gpu_calculate_splines.cuh"
#include "pme_grid.h"
/*! \brief
 * An inline CUDA function: unroll the dynamic index accesses to the constant grid sizes to avoid local memory operations.
 */
__device__ __forceinline__ float read_grid_size(const float* realGridSizeFP, const int dimIndex)
{
    switch (dimIndex)
    {
        case XX: return realGridSizeFP[XX];
        case YY: return realGridSizeFP[YY];
        case ZZ: return realGridSizeFP[ZZ];
    }
    assert(false);
    return 0.0f;
}
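/* Note on read_grid_size: indexing an array with a runtime dimIndex can force the
 * compiler to spill the data to local memory; the switch above turns each access into a
 * constant-index load that can stay in registers or constant cache, which is the
 * "unrolling" the brief above refers to.
 */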
/*! \brief Reduce the partial force contributions.
 *
 * \tparam[in] order              The PME order (must be 4).
 * \tparam[in] atomDataSize       The number of partial force contributions for each atom (currently
 *                                order^2 is used).
 * \tparam[in] blockSize          The CUDA block size.
 *
 * \param[out] sm_forces          Shared memory array with the output forces (number of elements
 *                                is number of atoms per block).
 * \param[in]  atomIndexLocal     Local atom index.
 * \param[in]  splineIndex        Spline index.
 * \param[in]  lineIndex          Line index (same as threadLocalId).
 * \param[in]  realGridSizeFP     Local grid size constant.
 * \param[in]  fx                 Input force partial component X.
 * \param[in]  fy                 Input force partial component Y.
 * \param[in]  fz                 Input force partial component Z.
 */
template<int order, int atomDataSize, int blockSize>
__device__ __forceinline__ void reduce_atom_forces(float3* __restrict__ sm_forces,
                                                   const int    atomIndexLocal,
                                                   const int    splineIndex,
                                                   const int    lineIndex,
                                                   const float* realGridSizeFP,
                                                   float& fx, // NOLINT(google-runtime-references)
                                                   float& fy, // NOLINT(google-runtime-references)
                                                   float& fz) // NOLINT(google-runtime-references)
{
    if (gmx::isPowerOfTwo(order)) // Only for orders of power of 2
    {
        const unsigned int activeMask = c_fullWarpMask;

        // A tricky shuffle reduction inspired by reduce_force_j_warp_shfl
        // TODO: find out if this is the best in terms of transactions count
        static_assert(order == 4, "Only order of 4 is implemented");
        static_assert(atomDataSize <= warp_size,
                      "TODO: rework for atomDataSize > warp_size (order 8 or larger)");
        const int width = atomDataSize;

        fx += __shfl_down_sync(activeMask, fx, 1, width);
        fy += __shfl_up_sync(activeMask, fy, 1, width);
        fz += __shfl_down_sync(activeMask, fz, 1, width);

        if (splineIndex & 1)
        {
            fx = fy;
        }

        fx += __shfl_down_sync(activeMask, fx, 2, width);
        fz += __shfl_up_sync(activeMask, fz, 2, width);

        if (splineIndex & 2)
        {
            fx = fz;
        }

        // By now fx contains intermediate quad sums of all 3 components:
        // splineIndex    0            1            2 and 3      4            5            6 and 7      8...
        // sum of...      fx0 to fx3   fy0 to fy3   fz0 to fz3   fx4 to fx7   fy4 to fy7   fz4 to fz7   etc.

        // We have to just further reduce those groups of 4
        for (int delta = 4; delta < atomDataSize; delta <<= 1)
        {
            fx += __shfl_down_sync(activeMask, fx, delta, width);
        }

        const int dimIndex = splineIndex;
        if (dimIndex < DIM)
        {
            const float n = read_grid_size(realGridSizeFP, dimIndex);
            float* __restrict__ sm_forcesAtomIndexOffset =
                    reinterpret_cast<float*>(&sm_forces[atomIndexLocal]);
            sm_forcesAtomIndexOffset[dimIndex] = fx * n;
        }
    }
    else
    {
        // We use blockSize shared memory elements to read fx, or fy, or fz, and then reduce them to
        // fit into smemPerDim elements which are stored separately (first 2 dimensions only)
        const int         smemPerDim   = warp_size;
        const int         smemReserved = (DIM)*smemPerDim;
        __shared__ float  sm_forceReduction[smemReserved + blockSize];
        __shared__ float* sm_forceTemp[DIM];

        const int numWarps = blockSize / smemPerDim;
        const int minStride =
                max(1, atomDataSize / numWarps); // order 4: 128 threads => 4, 256 threads => 2, etc

#pragma unroll
        for (int dimIndex = 0; dimIndex < DIM; dimIndex++)
        {
            int elementIndex = smemReserved + lineIndex;
            // Store input force contributions
            sm_forceReduction[elementIndex] = (dimIndex == XX) ? fx : (dimIndex == YY) ? fy : fz;
            // sync here because two warps write data that the first one consumes below
            __syncthreads();
            // Reduce to fit into smemPerDim (warp size)
#pragma unroll
            for (int redStride = atomDataSize / 2; redStride > minStride; redStride >>= 1)
            {
                if (splineIndex < redStride)
                {
                    sm_forceReduction[elementIndex] += sm_forceReduction[elementIndex + redStride];
                }
            }
            __syncthreads();
            // Last iteration - packing everything to be nearby, storing convenience pointer
            sm_forceTemp[dimIndex] = sm_forceReduction + dimIndex * smemPerDim;
            int redStride          = minStride;
            if (splineIndex < redStride)
            {
                const int packedIndex = atomIndexLocal * redStride + splineIndex;
                sm_forceTemp[dimIndex][packedIndex] =
                        sm_forceReduction[elementIndex] + sm_forceReduction[elementIndex + redStride];
            }
            __syncthreads();
        }

        assert((blockSize / warp_size) >= DIM);
        // assert (atomsPerBlock <= warp_size);

        const int warpIndex = lineIndex / warp_size;
        const int dimIndex  = warpIndex;

        // First 3 warps can now process 1 dimension each
        if (dimIndex < DIM)
        {
            int sourceIndex = lineIndex % warp_size;
#pragma unroll
            for (int redStride = minStride / 2; redStride > 1; redStride >>= 1)
            {
                if (!(splineIndex & redStride))
                {
                    sm_forceTemp[dimIndex][sourceIndex] += sm_forceTemp[dimIndex][sourceIndex + redStride];
                }
                __syncwarp();
            }

            const float n         = read_grid_size(realGridSizeFP, dimIndex);
            const int   atomIndex = sourceIndex / minStride;

            if (sourceIndex == minStride * atomIndex)
            {
                float* __restrict__ sm_forcesAtomIndexOffset =
                        reinterpret_cast<float*>(&sm_forces[atomIndex]);
                sm_forcesAtomIndexOffset[dimIndex] =
                        (sm_forceTemp[dimIndex][sourceIndex] + sm_forceTemp[dimIndex][sourceIndex + 1]) * n;
            }
        }
    }
}
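/* Worked example of the shuffle path above (order 4, atomDataSize 16): each atom owns 16
 * consecutive lanes, each holding partial fx, fy and fz. The first stage folds neighbours
 * at distance 1, and the (splineIndex & 1) swap parks the fy pair sums in fx on odd lanes;
 * the second stage does the same at distance 2 for fz. After the delta loop, lanes 0, 1
 * and 2 of each 16-lane group hold the atom's total x, y and z components, which is why a
 * single store per dimIndex < DIM suffices.
 */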
/*! \brief Calculate the sum of the force partial components (in X, Y and Z)
 *
 * \tparam[in] order              The PME order (must be 4).
 * \tparam[in] atomsPerWarp       The number of atoms per GPU warp.
 * \tparam[in] wrapX              Tells if the grid is wrapped in the X dimension.
 * \tparam[in] wrapY              Tells if the grid is wrapped in the Y dimension.
 * \param[out] fx                 The force partial component in the X dimension.
 * \param[out] fy                 The force partial component in the Y dimension.
 * \param[out] fz                 The force partial component in the Z dimension.
 * \param[in]  ithyMin            The thread minimum index in the Y dimension.
 * \param[in]  ithyMax            The thread maximum index in the Y dimension.
 * \param[in]  ixBase             The grid line index base value in the X dimension.
 * \param[in]  iz                 The grid line index in the Z dimension.
 * \param[in]  nx                 The grid real size in the X dimension.
 * \param[in]  ny                 The grid real size in the Y dimension.
 * \param[in]  pny                The padded grid real size in the Y dimension.
 * \param[in]  pnz                The padded grid real size in the Z dimension.
 * \param[in]  atomIndexLocal     The atom index for this thread.
 * \param[in]  splineIndexBase    The base value of the spline parameter index.
 * \param[in]  tdz                The theta and dtheta in the Z dimension.
 * \param[in]  sm_gridlineIndices Shared memory array of grid line indices.
 * \param[in]  sm_theta           Shared memory array of atom theta values.
 * \param[in]  sm_dtheta          Shared memory array of atom dtheta values.
 * \param[in]  gm_grid            Global memory array of the grid to use.
 */
template<int order, int atomsPerWarp, bool wrapX, bool wrapY>
__device__ __forceinline__ void sumForceComponents(float* __restrict__ fx,
                                                   float* __restrict__ fy,
                                                   float* __restrict__ fz,
                                                   const int    ithyMin,
                                                   const int    ithyMax,
                                                   const int    ixBase,
                                                   const int    iz,
                                                   const int    nx,
                                                   const int    ny,
                                                   const int    pny,
                                                   const int    pnz,
                                                   const int    atomIndexLocal,
                                                   const int    splineIndexBase,
                                                   const float2 tdz,
                                                   const int* __restrict__ sm_gridlineIndices,
                                                   const float* __restrict__ sm_theta,
                                                   const float* __restrict__ sm_dtheta,
                                                   const float* __restrict__ gm_grid)
{
#pragma unroll
    for (int ithy = ithyMin; ithy < ithyMax; ithy++)
    {
        const int splineIndexY = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
        const float2 tdy = make_float2(sm_theta[splineIndexY], sm_dtheta[splineIndexY]);

        int iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
        if (wrapY & (iy >= ny))
        {
            iy -= ny;
        }
        const int constOffset = iy * pnz + iz;

#pragma unroll
        for (int ithx = 0; (ithx < order); ithx++)
        {
            int ix = ixBase + ithx;
            if (wrapX & (ix >= nx))
            {
                ix -= nx;
            }
            const int gridIndexGlobal = ix * pny * pnz + constOffset;
            assert(gridIndexGlobal >= 0);
            const float gridValue = gm_grid[gridIndexGlobal];
            assert(isfinite(gridValue));
            const int splineIndexX = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
            const float2 tdx  = make_float2(sm_theta[splineIndexX], sm_dtheta[splineIndexX]);
            const float  fxy1 = tdz.x * gridValue;
            const float  fz1  = tdz.y * gridValue;
            *fx += tdx.y * tdy.x * fxy1;
            *fy += tdx.x * tdy.y * fxy1;
            *fz += tdx.x * tdy.x * fz1;
        }
    }
}
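/* In the loop above, .x of each float2 holds the spline value (theta) and .y its
 * derivative (dtheta). Each Cartesian force component therefore multiplies the grid value
 * by the spline derivative along its own dimension and the plain spline values along the
 * other two - the chain rule applied to the separable B-spline interpolation.
 */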
/*! \brief Calculate the grid forces and store them in shared memory.
 *
 * \param[in,out] sm_forces       Shared memory array with the output forces.
 * \param[in] forceIndexLocal     The local (per thread) index in the sm_forces array.
 * \param[in] forceIndexGlobal    The index of the thread in the gm_coefficients array.
 * \param[in] recipBox            The reciprocal box.
 * \param[in] scale               The scale to use when calculating the forces. For gm_coefficientsB
 *                                (when using multiple coefficients on a single grid) the scale will
 *                                be (1.0 - scale).
 * \param[in] gm_coefficients     Global memory array of the coefficients to use for an unperturbed
 *                                or FEP in state A if a single grid is used
 *                                (\p multiCoefficientsSingleGrid == true). If two separate grids
 *                                are used this should be the coefficients of the grid in question.
 *                                For FEP in state B with two separate grids, pass the B-state
 *                                coefficients; a gm_coefficientsB array is not passed separately.
 */
__device__ __forceinline__ void calculateAndStoreGridForces(float3* __restrict__ sm_forces,
                                                            const int   forceIndexLocal,
                                                            const int   forceIndexGlobal,
                                                            const float recipBox[DIM][DIM],
                                                            const float scale,
                                                            const float* __restrict__ gm_coefficients)
{
    const float3 atomForces     = sm_forces[forceIndexLocal];
    float        negCoefficient = -scale * gm_coefficients[forceIndexGlobal];
    float3       result;
    result.x = negCoefficient * recipBox[XX][XX] * atomForces.x;
    result.y = negCoefficient * (recipBox[XX][YY] * atomForces.x + recipBox[YY][YY] * atomForces.y);
    result.z = negCoefficient
               * (recipBox[XX][ZZ] * atomForces.x + recipBox[YY][ZZ] * atomForces.y
                  + recipBox[ZZ][ZZ] * atomForces.z);
    sm_forces[forceIndexLocal] = result;
}
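/* The reciprocal-box multiply above is deliberately triangular: GROMACS stores the
 * simulation box as a triangular matrix, so only the six recipBox elements used here can
 * be non-zero and the remaining products are skipped.
 */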
/*! \brief
 * A CUDA kernel which gathers the atom forces from the grid.
 * The grid is assumed to be wrapped in dimension Z.
 *
 * \tparam[in] order          The PME order (must be 4 currently).
 * \tparam[in] wrapX          Tells if the grid is wrapped in the X dimension.
 * \tparam[in] wrapY          Tells if the grid is wrapped in the Y dimension.
 * \tparam[in] numGrids       The number of grids to use in the kernel. Can be 1 or 2.
 * \tparam[in] readGlobal     Tells if we should read spline values from global memory.
 * \tparam[in] threadsPerAtom How many threads work on each atom.
 *
 * \param[in] kernelParams    All the PME GPU data.
 */
template<int order, bool wrapX, bool wrapY, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom>
__launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__
        void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams)
{
    assert(numGrids == 1 || numGrids == 2);

    /* Global memory pointers */
    const float* __restrict__ gm_coefficientsA = kernelParams.atoms.d_coefficients[0];
    const float* __restrict__ gm_coefficientsB = kernelParams.atoms.d_coefficients[1];
    const float* __restrict__ gm_gridA         = kernelParams.grid.d_realGrid[0];
    const float* __restrict__ gm_gridB         = kernelParams.grid.d_realGrid[1];
    static_assert(sizeof(*kernelParams.atoms.d_forces) == 3 * sizeof(float));
    float* __restrict__ gm_forces = reinterpret_cast<float*>(kernelParams.atoms.d_forces);

    /* Global memory pointers for readGlobal */
    const float* __restrict__ gm_theta           = kernelParams.atoms.d_theta;
    const float* __restrict__ gm_dtheta          = kernelParams.atoms.d_dtheta;
    const int* __restrict__   gm_gridlineIndices = kernelParams.atoms.d_gridlineIndices;

    float3 atomX;
    float  atomCharge;
    const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;

    /* Number of data components and threads for a single atom */
    const int threadsPerAtomValue = (threadsPerAtom == ThreadsPerAtom::Order) ? order : order * order;
    const int atomDataSize        = threadsPerAtomValue;
    const int atomsPerBlock       = c_gatherMaxThreadsPerBlock / atomDataSize;
    // Number of atoms processed by a single warp in spread and gather
    const int atomsPerWarp = warp_size / atomDataSize;
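    // For example, with order 4 and ThreadsPerAtom::OrderSquared, atomDataSize is 16, so a
    // 32-thread warp covers 2 atoms; with ThreadsPerAtom::Order it is 4 and a warp covers 8.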
    const int blockSize = atomsPerBlock * atomDataSize;
    assert(blockSize == blockDim.x * blockDim.y * blockDim.z);

    /* These are the atom indices - for the shared and global memory */
    const int atomIndexLocal  = threadIdx.z;
    const int atomIndexOffset = blockIndex * atomsPerBlock;
    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
    /* Early return for fully empty blocks at the end
     * (should only happen for billions of input atoms)
     */
    if (atomIndexOffset >= kernelParams.atoms.nAtoms)
    {
        return;
    }
    // 4 warps per block, 8 atoms per warp *3 *4
    const int splineParamsSize    = atomsPerBlock * DIM * order;
    const int gridlineIndicesSize = atomsPerBlock * DIM;
    __shared__ int   sm_gridlineIndices[gridlineIndicesSize];
    __shared__ float sm_theta[splineParamsSize];
    __shared__ float sm_dtheta[splineParamsSize];
    /* Spline Z coordinates */
    const int ithz = threadIdx.x;

    /* These are the spline contribution indices in shared memory */
    const int splineIndex = threadIdx.y * blockDim.x + threadIdx.x;
    const int lineIndex   = (threadIdx.z * (blockDim.x * blockDim.y))
                          + splineIndex; /* And to all the block's particles */

    const int threadLocalId =
            (threadIdx.z * (blockDim.x * blockDim.y)) + blockDim.x * threadIdx.y + threadIdx.x;
    const int threadLocalIdMax = blockDim.x * blockDim.y * blockDim.z;
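    // Thread layout recap: threadIdx.x indexes the spline contribution in Z, threadIdx.y
    // the contribution in Y (a single slot when ThreadsPerAtom::Order), and threadIdx.z
    // selects the atom within the block; lineIndex/threadLocalId flatten this to a
    // block-wide id.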
    if (readGlobal)
    {
        /* Read splines */
        const int localGridlineIndicesIndex = threadLocalId;
        const int globalGridlineIndicesIndex = blockIndex * gridlineIndicesSize + localGridlineIndicesIndex;
        if (localGridlineIndicesIndex < gridlineIndicesSize)
        {
            sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
            assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
        }
        /* The loop needed for order threads per atom to make sure we load all data values, as each thread must load multiple values
           with order*order threads per atom, it is only required for each thread to load one data value */

        const int iMin = 0;
        const int iMax = (threadsPerAtom == ThreadsPerAtom::Order) ? 3 : 1;

        for (int i = iMin; i < iMax; i++)
        {
            int localSplineParamsIndex =
                    threadLocalId
                    + i * threadLocalIdMax; /* i will always be zero for order*order threads per atom */
            int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex;
            if (localSplineParamsIndex < splineParamsSize)
            {
                sm_theta[localSplineParamsIndex]  = gm_theta[globalSplineParamsIndex];
                sm_dtheta[localSplineParamsIndex] = gm_dtheta[globalSplineParamsIndex];
                assert(isfinite(sm_theta[localSplineParamsIndex]));
                assert(isfinite(sm_dtheta[localSplineParamsIndex]));
            }
        }
        __syncthreads();
    }
    else
    {
        const float3* __restrict__ gm_coordinates = asFloat3(kernelParams.atoms.d_coordinates);
        /* Recalculate splines */
        if (c_useAtomDataPrefetch)
        {
            // Charges
            __shared__ float sm_coefficients[atomsPerBlock];
            // Coordinates
            __shared__ float3 sm_coordinates[atomsPerBlock];
            /* Staging coefficients/charges */
            pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients, gm_coefficientsA);

            /* Staging coordinates */
            pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(sm_coordinates, gm_coordinates);
            __syncthreads();
            atomX      = sm_coordinates[atomIndexLocal];
            atomCharge = sm_coefficients[atomIndexLocal];
        }
        else
        {
            atomX      = gm_coordinates[atomIndexGlobal];
            atomCharge = gm_coefficientsA[atomIndexGlobal];
        }
        calculate_splines<order, atomsPerBlock, atomsPerWarp, true, false, numGrids>(
                kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, sm_dtheta, sm_gridlineIndices);
        __syncwarp();
    }
    float fx = 0.0f;
    float fy = 0.0f;
    float fz = 0.0f;

    const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsA[atomIndexGlobal]);

    const int nx  = kernelParams.grid.realGridSize[XX];
    const int ny  = kernelParams.grid.realGridSize[YY];
    const int nz  = kernelParams.grid.realGridSize[ZZ];
    const int pny = kernelParams.grid.realGridSizePadded[YY];
    const int pnz = kernelParams.grid.realGridSizePadded[ZZ];

    const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
    const int warpIndex     = atomIndexLocal / atomsPerWarp;

    const int splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
    const int splineIndexZ = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
    const float2 tdz       = make_float2(sm_theta[splineIndexZ], sm_dtheta[splineIndexZ]);

    int       iz     = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
    const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];

    if (iz >= nz)
    {
        iz -= nz;
    }

    const int ithyMin = (threadsPerAtom == ThreadsPerAtom::Order) ? 0 : threadIdx.y;
    const int ithyMax = (threadsPerAtom == ThreadsPerAtom::Order) ? order : threadIdx.y + 1;
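    // With ThreadsPerAtom::Order, each thread owns one Z contribution and loops over all
    // order Y contributions; with ThreadsPerAtom::OrderSquared, the (Y, Z) pair is fully
    // determined by the thread index, so the Y loop collapses to a single iteration.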
    if (chargeCheck)
    {
        sumForceComponents<order, atomsPerWarp, wrapX, wrapY>(&fx,
                                                              &fy,
                                                              &fz,
                                                              ithyMin,
                                                              ithyMax,
                                                              ixBase,
                                                              iz,
                                                              nx,
                                                              ny,
                                                              pny,
                                                              pnz,
                                                              atomIndexLocal,
                                                              splineIndexBase,
                                                              tdz,
                                                              sm_gridlineIndices,
                                                              sm_theta,
                                                              sm_dtheta,
                                                              gm_gridA);
    }
    // Reduction of partial force contributions
    __shared__ float3 sm_forces[atomsPerBlock];
    reduce_atom_forces<order, atomDataSize, blockSize>(
            sm_forces, atomIndexLocal, splineIndex, lineIndex, kernelParams.grid.realGridSizeFP, fx, fy, fz);
    __syncthreads();
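    // The __syncthreads() above is required because sm_forces is filled by the reducing
    // threads but read below by a different set of threads (one per atom), and the same
    // shared buffer is reused for the second grid when numGrids == 2.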
    /* Calculating the final forces with no component branching, atomsPerBlock threads */
    const int   forceIndexLocal  = threadLocalId;
    const int   forceIndexGlobal = atomIndexOffset + forceIndexLocal;
    const float scale            = kernelParams.current.scale;
    if (forceIndexLocal < atomsPerBlock)
    {
        calculateAndStoreGridForces(
                sm_forces, forceIndexLocal, forceIndexGlobal, kernelParams.current.recipBox, scale, gm_coefficientsA);
    }
    __syncwarp();
    assert(atomsPerBlock <= warp_size);

    /* Writing or adding the final forces component-wise, single warp */
    const int blockForcesSize = atomsPerBlock * DIM;
    const int numIter         = (blockForcesSize + warp_size - 1) / warp_size;
    const int iterThreads     = blockForcesSize / numIter;
    if (threadLocalId < iterThreads)
    {
#pragma unroll
        for (int i = 0; i < numIter; i++)
        {
            int   outputIndexLocal     = i * iterThreads + threadLocalId;
            int   outputIndexGlobal    = blockIndex * blockForcesSize + outputIndexLocal;
            float outputForceComponent = (reinterpret_cast<float*>(sm_forces)[outputIndexLocal]);
            gm_forces[outputIndexGlobal] = outputForceComponent;
        }
    }
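    // Note the plain assignment above: the first (or only) grid overwrites gm_forces,
    // while the numGrids == 2 pass below accumulates into it with +=.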
    if (numGrids == 2)
    {
        /* We must sync here since the same shared memory is used as above. */
        __syncthreads();
        fx = 0.0f;
        fy = 0.0f;
        fz = 0.0f;
        const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsB[atomIndexGlobal]);
        if (chargeCheck)
        {
            sumForceComponents<order, atomsPerWarp, wrapX, wrapY>(&fx,
                                                                  &fy,
                                                                  &fz,
                                                                  ithyMin,
                                                                  ithyMax,
                                                                  ixBase,
                                                                  iz,
                                                                  nx,
                                                                  ny,
                                                                  pny,
                                                                  pnz,
                                                                  atomIndexLocal,
                                                                  splineIndexBase,
                                                                  tdz,
                                                                  sm_gridlineIndices,
                                                                  sm_theta,
                                                                  sm_dtheta,
                                                                  gm_gridB);
        }
        // Reduction of partial force contributions
        reduce_atom_forces<order, atomDataSize, blockSize>(
                sm_forces, atomIndexLocal, splineIndex, lineIndex, kernelParams.grid.realGridSizeFP, fx, fy, fz);
        __syncthreads();
        /* Calculating the final forces with no component branching, atomsPerBlock threads */
        if (forceIndexLocal < atomsPerBlock)
        {
            calculateAndStoreGridForces(sm_forces,
                                        forceIndexLocal,
                                        forceIndexGlobal,
                                        kernelParams.current.recipBox,
                                        1.0F - scale,
                                        gm_coefficientsB);
        }
        __syncwarp();
        /* Writing or adding the final forces component-wise, single warp */
        if (threadLocalId < iterThreads)
        {
#pragma unroll
            for (int i = 0; i < numIter; i++)
            {
                int   outputIndexLocal     = i * iterThreads + threadLocalId;
                int   outputIndexGlobal    = blockIndex * blockForcesSize + outputIndexLocal;
                float outputForceComponent = (reinterpret_cast<float*>(sm_forces)[outputIndexLocal]);
                gm_forces[outputIndexGlobal] += outputForceComponent;
            }
        }
    }
}
//! Kernel instantiations
// clang-format off
template __global__ void pme_gather_kernel<4, true, true, 1, true,  ThreadsPerAtom::Order>        (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 1, true,  ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 1, false, ThreadsPerAtom::Order>        (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 1, false, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 2, true,  ThreadsPerAtom::Order>        (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 2, true,  ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 2, false, ThreadsPerAtom::Order>        (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 2, false, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
// clang-format on
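// The eight instantiations above span the template matrix used with order 4: one or two
// grids, splines read from global memory or recalculated, and order or order^2 threads
// per atom; X/Y wrapping is always enabled here.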