src/gromacs/ewald/pme-gather.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *  \brief Implements PME force gathering in CUDA.
  38  *
  39  *  \author Aleksei Iupinov <a.yupinov@gmail.com>
  40  */
  41
  42 #include "gmxpre.h"
  43
  44 #include <cassert>
  45
  46 #include "pme.cuh"
  47 #include "pme-gpu-utils.h"
  48
  49 /*! \brief
  50  * An inline CUDA function: unroll the dynamic index accesses to the constant grid sizes to avoid local memory operations.
  51  */
  52 __device__ __forceinline__ float read_grid_size(const float *realGridSizeFP,
  53                                                 const int    dimIndex)
  54 {
  55     switch (dimIndex)
  56     {
  57         case XX: return realGridSizeFP[XX];
  58         case YY: return realGridSizeFP[YY];
  59         case ZZ: return realGridSizeFP[ZZ];
  60     }
  61     assert(false);
  62     return 0.0f;
  63 }
  64
  65 /*! \brief Reduce the partial force contributions.
  66  *
  67  * \tparam[in] order              The PME order (must be 4).
  68  * \tparam[in] atomDataSize       The number of partial force contributions for each atom (currently order^2 == 16)
  69  * \tparam[in] blockSize          The CUDA block size
  70  * \param[out] sm_forces          Shared memory array with the output forces (number of elements is number of atoms per block)
  71  * \param[in]  atomIndexLocal     Local atom index
  72  * \param[in]  splineIndex        Spline index
  73  * \param[in]  lineIndex          Line index (same as threadLocalId)
  74  * \param[in]  realGridSizeFP     Local grid size constant
  75  * \param[in]  fx                 Input force partial component X
  76  * \param[in]  fy                 Input force partial component Y
  77  * \param[in]  fz                 Input force partial component Z
  78  */
  79 template <
  80     const int order,
  81     const int atomDataSize,
  82     const int blockSize
  83     >
  84 __device__ __forceinline__ void reduce_atom_forces(float3 * __restrict__ sm_forces,
  85                                                    const int             atomIndexLocal,
  86                                                    const int             splineIndex,
  87                                                    const int             lineIndex,
  88                                                    const float          *realGridSizeFP,
  89                                                    float                &fx,
  90                                                    float                &fy,
  91                                                    float                &fz)
  92 {
  93 #if (GMX_PTX_ARCH >= 300)
  94     if (!(order & (order - 1))) // Only for orders of power of 2
  95     {
  96         const unsigned int activeMask = c_fullWarpMask;
  97
  98         // A tricky shuffle reduction inspired by reduce_force_j_warp_shfl
  99         // TODO: find out if this is the best in terms of transactions count
 100         static_assert(order == 4, "Only order of 4 is implemented");
 101         static_assert(atomDataSize <= warp_size, "TODO: rework for atomDataSize > warp_size (order 8 or larger)");
 102         const int width = atomDataSize;
 103
 104         fx += gmx_shfl_down_sync(activeMask, fx, 1, width);
 105         fy += gmx_shfl_up_sync  (activeMask, fy, 1, width);
 106         fz += gmx_shfl_down_sync(activeMask, fz, 1, width);
 107
 108         if (splineIndex & 1)
 109         {
 110             fx = fy;
 111         }
 112
 113         fx += gmx_shfl_down_sync(activeMask, fx, 2, width);
 114         fz += gmx_shfl_up_sync  (activeMask, fz, 2, width);
 115
 116         if (splineIndex & 2)
 117         {
 118             fx = fz;
 119         }
 120
 121         // By now fx contains intermediate quad sums of all 3 components:
 122         // splineIndex    0            1            2 and 3      4            5            6 and 7      8...
 123         // sum of...      fx0 to fx3   fy0 to fy3   fz0 to fz3   fx4 to fx7   fy4 to fy7   fz4 to fz7   etc.
 124
 125         // We have to just further reduce those groups of 4
 126         for (int delta = 4; delta < atomDataSize; delta <<= 1)
 127         {
 128             fx += gmx_shfl_down_sync(activeMask, fx, delta, width);
 129         }
 130
 131         const int dimIndex = splineIndex;
 132         if (dimIndex < DIM)
 133         {
 134             const float n = read_grid_size(realGridSizeFP, dimIndex);
 135             *((float *)(&sm_forces[atomIndexLocal]) + dimIndex) = fx * n;
 136         }
 137     }
 138     else
 139 #endif
 140     {
 141         // We use blockSize shared memory elements to read fx, or fy, or fz, and then reduce them to fit into smemPerDim elements
 142         // which are stored separately (first 2 dimensions only)
 143         const int         smemPerDim   = warp_size;
 144         const int         smemReserved = (DIM - 1) * smemPerDim;
 145         __shared__ float  sm_forceReduction[smemReserved + blockSize];
 146         __shared__ float *sm_forceTemp[DIM];
 147
 148         const int         numWarps  = blockSize / smemPerDim;
 149         const int         minStride = max(1, atomDataSize / numWarps); // order 4: 128 threads => 4, 256 threads => 2, etc
 150
 151 #pragma unroll
 152         for (int dimIndex = 0; dimIndex < DIM; dimIndex++)
 153         {
 154             int elementIndex = smemReserved + lineIndex;
 155             // Store input force contributions
 156             sm_forceReduction[elementIndex] = (dimIndex == XX) ? fx : (dimIndex == YY) ? fy : fz;
 157             // Reduce to fit into smemPerDim (warp size)
 158 #pragma unroll
 159             for (int redStride = atomDataSize / 2; redStride > minStride; redStride >>= 1)
 160             {
 161                 if (splineIndex < redStride)
 162                 {
 163                     sm_forceReduction[elementIndex] += sm_forceReduction[elementIndex + redStride];
 164                 }
 165             }
 166             __syncthreads();
 167             // Last iteration - packing everything to be nearby, storing convenience pointer
 168             sm_forceTemp[dimIndex] = sm_forceReduction + dimIndex * smemPerDim;
 169             int redStride = minStride;
 170             if (splineIndex < redStride)
 171             {
 172                 const int packedIndex = atomIndexLocal * redStride + splineIndex;
 173                 sm_forceTemp[dimIndex][packedIndex] = sm_forceReduction[elementIndex] + sm_forceReduction[elementIndex + redStride];
 174             }
 175         }
 176
 177         __syncthreads();
 178
 179         assert ((blockSize / warp_size) >= DIM);
 180         //assert (atomsPerBlock <= warp_size);
 181
 182         const int warpIndex = lineIndex / warp_size;
 183         const int dimIndex  = warpIndex;
 184
 185         // First 3 warps can now process 1 dimension each
 186         if (dimIndex < DIM)
 187         {
 188             int sourceIndex = lineIndex % warp_size;
 189 #pragma unroll
 190             for (int redStride = minStride / 2; redStride > 1; redStride >>= 1)
 191             {
 192                 if (!(splineIndex & redStride))
 193                 {
 194                     sm_forceTemp[dimIndex][sourceIndex] += sm_forceTemp[dimIndex][sourceIndex + redStride];
 195                 }
 196             }
 197
 198             const float n = read_grid_size(realGridSizeFP, dimIndex);
 199
 200             const int   atomIndex = sourceIndex / minStride;
 201             if (sourceIndex == minStride * atomIndex)
 202             {
 203                 *((float *)(&sm_forces[atomIndex]) + dimIndex) = (sm_forceTemp[dimIndex][sourceIndex] + sm_forceTemp[dimIndex][sourceIndex + 1]) * n;
 204             }
 205         }
 206     }
 207 }
 208
 209 /*! \brief
 210  * A CUDA kernel which gathers the atom forces from the grid.
 211  * The grid is assumed to be wrapped in dimension Z.
 212  *
 213  * \tparam[in] order                The PME order (must be 4 currently).
 214  * \tparam[in] overwriteForces      True: the forces are written to the output buffer;
 215  *                                  False: the forces are added non-atomically to the output buffer (e.g. to the bonded forces).
 216  * \tparam[in] wrapX                Tells if the grid is wrapped in the X dimension.
 217  * \tparam[in] wrapY                Tells if the grid is wrapped in the Y dimension.
 218  * \param[in]  kernelParams         All the PME GPU data.
 219  */
 220 template <
 221     const int order,
 222     const bool overwriteForces,
 223     const bool wrapX,
 224     const bool wrapY
 225     >
 226 __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP)
 227 __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams    kernelParams)
 228 {
 229     /* Global memory pointers */
 230     const float * __restrict__  gm_coefficients     = kernelParams.atoms.d_coefficients;
 231     const float * __restrict__  gm_grid             = kernelParams.grid.d_realGrid;
 232     const float * __restrict__  gm_theta            = kernelParams.atoms.d_theta;
 233     const float * __restrict__  gm_dtheta           = kernelParams.atoms.d_dtheta;
 234     const int * __restrict__    gm_gridlineIndices  = kernelParams.atoms.d_gridlineIndices;
 235     float * __restrict__        gm_forces           = kernelParams.atoms.d_forces;
 236
 237     /* Some sizes */
 238     const int    atomsPerBlock  = (c_gatherMaxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM);
 239     const int    atomDataSize   = PME_SPREADGATHER_THREADS_PER_ATOM; /* Number of data components and threads for a single atom */
 240     const int    blockSize      = atomsPerBlock * atomDataSize;
 241     const int    atomsPerWarp   = PME_SPREADGATHER_ATOMS_PER_WARP;
 242
 243     const int    blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
 244
 245     /* These are the atom indices - for the shared and global memory */
 246     const int         atomIndexLocal    = threadIdx.z;
 247     const int         atomIndexOffset   = blockIndex * atomsPerBlock;
 248     const int         atomIndexGlobal   = atomIndexOffset + atomIndexLocal;
 249
 250     /* Early return for fully empty blocks at the end
 251      * (should only happen on Fermi or billions of input atoms)
 252      */
 253     if (atomIndexOffset >= kernelParams.atoms.nAtoms)
 254     {
 255         return;
 256     }
 257
 258     const int         splineParamsSize             = atomsPerBlock * DIM * order;
 259     const int         gridlineIndicesSize          = atomsPerBlock * DIM;
 260     __shared__ int    sm_gridlineIndices[gridlineIndicesSize];
 261     __shared__ float2 sm_splineParams[splineParamsSize]; /* Theta/dtheta pairs  as .x/.y */
 262
 263     /* Spline Y/Z coordinates */
 264     const int ithy = threadIdx.y;
 265     const int ithz = threadIdx.x;
 266
 267     /* These are the spline contribution indices in shared memory */
 268     const int splineIndex = threadIdx.y * blockDim.x + threadIdx.x;                  /* Relative to the current particle , 0..15 for order 4 */
 269     const int lineIndex   = (threadIdx.z * (blockDim.x * blockDim.y)) + splineIndex; /* And to all the block's particles */
 270
 271     int       threadLocalId = (threadIdx.z * (blockDim.x * blockDim.y))
 272         + (threadIdx.y * blockDim.x)
 273         + threadIdx.x;
 274
 275     /* Staging the atom gridline indices, DIM * atomsPerBlock threads */
 276     const int localGridlineIndicesIndex  = threadLocalId;
 277     const int globalGridlineIndicesIndex = blockIndex * gridlineIndicesSize + localGridlineIndicesIndex;
 278     const int globalCheckIndices         = pme_gpu_check_atom_data_index(globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
 279     if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
 280     {
 281         sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
 282         assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
 283     }
 284     /* Staging the spline parameters, DIM * order * atomsPerBlock threads */
 285     const int localSplineParamsIndex  = threadLocalId;
 286     const int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex;
 287     const int globalCheckSplineParams = pme_gpu_check_atom_data_index(globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
 288     if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
 289     {
 290         sm_splineParams[localSplineParamsIndex].x = gm_theta[globalSplineParamsIndex];
 291         sm_splineParams[localSplineParamsIndex].y = gm_dtheta[globalSplineParamsIndex];
 292         assert(isfinite(sm_splineParams[localSplineParamsIndex].x));
 293         assert(isfinite(sm_splineParams[localSplineParamsIndex].y));
 294     }
 295     __syncthreads();
 296
 297     float           fx = 0.0f;
 298     float           fy = 0.0f;
 299     float           fz = 0.0f;
 300
 301     const int       globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
 302     const int       chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]);
 303
 304     if (chargeCheck & globalCheck)
 305     {
 306         const int    nx        = kernelParams.grid.realGridSize[XX];
 307         const int    ny        = kernelParams.grid.realGridSize[YY];
 308         const int    nz        = kernelParams.grid.realGridSize[ZZ];
 309         const int    pny       = kernelParams.grid.realGridSizePadded[YY];
 310         const int    pnz       = kernelParams.grid.realGridSizePadded[ZZ];
 311
 312         const int    atomWarpIndex = atomIndexLocal % atomsPerWarp;
 313         const int    warpIndex     = atomIndexLocal / atomsPerWarp;
 314
 315         const int    splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
 316         const int    splineIndexY    = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
 317         const float2 tdy             = sm_splineParams[splineIndexY];
 318         const int    splineIndexZ    = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
 319         const float2 tdz             = sm_splineParams[splineIndexZ];
 320
 321         const int    ixBase         = sm_gridlineIndices[atomIndexLocal * DIM + XX];
 322         int          iy             = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
 323         if (wrapY & (iy >= ny))
 324         {
 325             iy -= ny;
 326         }
 327         int iz  = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
 328         if (iz >= nz)
 329         {
 330             iz -= nz;
 331         }
 332         const int constOffset    = iy * pnz + iz;
 333
 334 #pragma unroll
 335         for (int ithx = 0; (ithx < order); ithx++)
 336         {
 337             int ix = ixBase + ithx;
 338             if (wrapX & (ix >= nx))
 339             {
 340                 ix -= nx;
 341             }
 342             const int     gridIndexGlobal  = ix * pny * pnz + constOffset;
 343             assert(gridIndexGlobal >= 0);
 344             const float   gridValue    = gm_grid[gridIndexGlobal];
 345             assert(isfinite(gridValue));
 346             const int     splineIndexX = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
 347             const float2  tdx          = sm_splineParams[splineIndexX];
 348             const float   fxy1         = tdz.x * gridValue;
 349             const float   fz1          = tdz.y * gridValue;
 350             fx += tdx.y * tdy.x * fxy1;
 351             fy += tdx.x * tdy.y * fxy1;
 352             fz += tdx.x * tdy.x * fz1;
 353         }
 354     }
 355
 356     // Reduction of partial force contributions
 357     __shared__ float3 sm_forces[atomsPerBlock];
 358     reduce_atom_forces<order, atomDataSize, blockSize>(sm_forces,
 359                                                        atomIndexLocal, splineIndex, lineIndex,
 360                                                        kernelParams.grid.realGridSizeFP,
 361                                                        fx, fy, fz);
 362     __syncthreads();
 363
 364     /* Calculating the final forces with no component branching, atomsPerBlock threads */
 365     const int forceIndexLocal  = threadLocalId;
 366     const int forceIndexGlobal = atomIndexOffset + forceIndexLocal;
 367     const int calcIndexCheck   = pme_gpu_check_atom_data_index(forceIndexGlobal, kernelParams.atoms.nAtoms);
 368     if ((forceIndexLocal < atomsPerBlock) & calcIndexCheck)
 369     {
 370         const float3  atomForces               = sm_forces[forceIndexLocal];
 371         const float   negCoefficient           = -gm_coefficients[forceIndexGlobal];
 372         float3        result;
 373         result.x                   = negCoefficient * kernelParams.current.recipBox[XX][XX] * atomForces.x;
 374         result.y                   = negCoefficient * (kernelParams.current.recipBox[XX][YY] * atomForces.x + kernelParams.current.recipBox[YY][YY] * atomForces.y);
 375         result.z                   = negCoefficient * (kernelParams.current.recipBox[XX][ZZ] * atomForces.x + kernelParams.current.recipBox[YY][ZZ] * atomForces.y + kernelParams.current.recipBox[ZZ][ZZ] * atomForces.z);
 376         sm_forces[forceIndexLocal] = result;
 377     }
 378
 379     gmx_syncwarp();
 380     assert(atomsPerBlock <= warp_size);
 381
 382     /* Writing or adding the final forces component-wise, single warp */
 383     const int blockForcesSize = atomsPerBlock * DIM;
 384     const int numIter         = (blockForcesSize + warp_size - 1) / warp_size;
 385     const int iterThreads     = blockForcesSize / numIter;
 386     if (threadLocalId < iterThreads)
 387     {
 388 #pragma unroll
 389         for (int i = 0; i < numIter; i++)
 390         {
 391             int         outputIndexLocal  = i * iterThreads + threadLocalId;
 392             int         outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal;
 393             const int   globalOutputCheck = pme_gpu_check_atom_data_index(outputIndexGlobal, kernelParams.atoms.nAtoms * DIM);
 394             if (globalOutputCheck)
 395             {
 396                 const float outputForceComponent = ((float *)sm_forces)[outputIndexLocal];
 397                 if (overwriteForces)
 398                 {
 399                     gm_forces[outputIndexGlobal] = outputForceComponent;
 400                 }
 401                 else
 402                 {
 403                     gm_forces[outputIndexGlobal] += outputForceComponent;
 404                 }
 405             }
 406         }
 407     }
 408 }
 409
 410 //! Kernel instantiations
 411 template __global__ void pme_gather_kernel<4, true, true, true>(const PmeGpuCudaKernelParams);
 412 template __global__ void pme_gather_kernel<4, false, true, true>(const PmeGpuCudaKernelParams);