src/gromacs/ewald/pme_gather.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *  \brief Implements PME force gathering in CUDA.
  38  *
  39  *  \author Aleksei Iupinov <a.yupinov@gmail.com>
  40  */
  41
  42 #include "gmxpre.h"
  43
  44 #include <cassert>
  45
  46 #include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
  47 #include "gromacs/gpu_utils/typecasts.cuh"
  48
  49 #include "pme.cuh"
  50 #include "pme_gpu_calculate_splines.cuh"
  51 #include "pme_grid.h"
  52
  53 /*! \brief
  54  * An inline CUDA function: unroll the dynamic index accesses to the constant grid sizes to avoid local memory operations.
  55  */
  56 __device__ __forceinline__ float read_grid_size(const float* realGridSizeFP, const int dimIndex)
  57 {
  58     switch (dimIndex)
  59     {
  60         case XX: return realGridSizeFP[XX];
  61         case YY: return realGridSizeFP[YY];
  62         case ZZ: return realGridSizeFP[ZZ];
  63     }
  64     assert(false);
  65     return 0.0f;
  66 }
  67
  68 /*! \brief Reduce the partial force contributions.
  69  *
  70  * \tparam[in] order              The PME order (must be 4).
  71  * \tparam[in] atomDataSize       The number of partial force contributions for each atom (currently
  72  * order^2 == 16) \tparam[in] blockSize          The CUDA block size \param[out] sm_forces Shared
  73  * memory array with the output forces (number of elements is number of atoms per block) \param[in]
  74  * atomIndexLocal     Local atom index \param[in]  splineIndex        Spline index \param[in]
  75  * lineIndex          Line index (same as threadLocalId) \param[in]  realGridSizeFP     Local grid
  76  * size constant \param[in]  fx                 Input force partial component X \param[in]  fy Input
  77  * force partial component Y \param[in]  fz                 Input force partial component Z
  78  */
  79 template<const int order, const int atomDataSize, const int blockSize>
  80 __device__ __forceinline__ void reduce_atom_forces(float3* __restrict__ sm_forces,
  81                                                    const int    atomIndexLocal,
  82                                                    const int    splineIndex,
  83                                                    const int    lineIndex,
  84                                                    const float* realGridSizeFP,
  85                                                    float&       fx,
  86                                                    float&       fy,
  87                                                    float&       fz)
  88 {
  89     if (!(order & (order - 1))) // Only for orders of power of 2
  90     {
  91         const unsigned int activeMask = c_fullWarpMask;
  92
  93         // A tricky shuffle reduction inspired by reduce_force_j_warp_shfl
  94         // TODO: find out if this is the best in terms of transactions count
  95         static_assert(order == 4, "Only order of 4 is implemented");
  96         static_assert(atomDataSize <= warp_size,
  97                       "TODO: rework for atomDataSize > warp_size (order 8 or larger)");
  98         const int width = atomDataSize;
  99
 100         fx += __shfl_down_sync(activeMask, fx, 1, width);
 101         fy += __shfl_up_sync(activeMask, fy, 1, width);
 102         fz += __shfl_down_sync(activeMask, fz, 1, width);
 103
 104         if (splineIndex & 1)
 105         {
 106             fx = fy;
 107         }
 108
 109         fx += __shfl_down_sync(activeMask, fx, 2, width);
 110         fz += __shfl_up_sync(activeMask, fz, 2, width);
 111
 112         if (splineIndex & 2)
 113         {
 114             fx = fz;
 115         }
 116
 117         // By now fx contains intermediate quad sums of all 3 components:
 118         // splineIndex    0            1            2 and 3      4            5            6 and 7 8...
 119         // sum of...      fx0 to fx3   fy0 to fy3   fz0 to fz3   fx4 to fx7   fy4 to fy7   fz4 to fz7 etc.
 120
 121         // We have to just further reduce those groups of 4
 122         for (int delta = 4; delta < atomDataSize; delta <<= 1)
 123         {
 124             fx += __shfl_down_sync(activeMask, fx, delta, width);
 125         }
 126
 127         const int dimIndex = splineIndex;
 128         if (dimIndex < DIM)
 129         {
 130             const float n = read_grid_size(realGridSizeFP, dimIndex);
 131             *((float*)(&sm_forces[atomIndexLocal]) + dimIndex) = fx * n;
 132         }
 133     }
 134     else
 135     {
 136         // We use blockSize shared memory elements to read fx, or fy, or fz, and then reduce them to
 137         // fit into smemPerDim elements which are stored separately (first 2 dimensions only)
 138         const int         smemPerDim   = warp_size;
 139         const int         smemReserved = (DIM)*smemPerDim;
 140         __shared__ float  sm_forceReduction[smemReserved + blockSize];
 141         __shared__ float* sm_forceTemp[DIM];
 142
 143         const int numWarps = blockSize / smemPerDim;
 144         const int minStride =
 145                 max(1, atomDataSize / numWarps); // order 4: 128 threads => 4, 256 threads => 2, etc
 146
 147 #pragma unroll
 148         for (int dimIndex = 0; dimIndex < DIM; dimIndex++)
 149         {
 150             int elementIndex = smemReserved + lineIndex;
 151             // Store input force contributions
 152             sm_forceReduction[elementIndex] = (dimIndex == XX) ? fx : (dimIndex == YY) ? fy : fz;
 153             // sync here because two warps write data that the first one consumes below
 154             __syncthreads();
 155             // Reduce to fit into smemPerDim (warp size)
 156 #pragma unroll
 157             for (int redStride = atomDataSize / 2; redStride > minStride; redStride >>= 1)
 158             {
 159                 if (splineIndex < redStride)
 160                 {
 161                     sm_forceReduction[elementIndex] += sm_forceReduction[elementIndex + redStride];
 162                 }
 163             }
 164             __syncthreads();
 165             // Last iteration - packing everything to be nearby, storing convenience pointer
 166             sm_forceTemp[dimIndex] = sm_forceReduction + dimIndex * smemPerDim;
 167             int redStride          = minStride;
 168             if (splineIndex < redStride)
 169             {
 170                 const int packedIndex = atomIndexLocal * redStride + splineIndex;
 171                 sm_forceTemp[dimIndex][packedIndex] =
 172                         sm_forceReduction[elementIndex] + sm_forceReduction[elementIndex + redStride];
 173             }
 174             __syncthreads();
 175         }
 176
 177         assert((blockSize / warp_size) >= DIM);
 178         // assert (atomsPerBlock <= warp_size);
 179
 180         const int warpIndex = lineIndex / warp_size;
 181         const int dimIndex  = warpIndex;
 182
 183         // First 3 warps can now process 1 dimension each
 184         if (dimIndex < DIM)
 185         {
 186             int sourceIndex = lineIndex % warp_size;
 187 #pragma unroll
 188             for (int redStride = minStride / 2; redStride > 1; redStride >>= 1)
 189             {
 190                 if (!(splineIndex & redStride))
 191                 {
 192                     sm_forceTemp[dimIndex][sourceIndex] += sm_forceTemp[dimIndex][sourceIndex + redStride];
 193                 }
 194             }
 195
 196             __syncwarp();
 197
 198             const float n         = read_grid_size(realGridSizeFP, dimIndex);
 199             const int   atomIndex = sourceIndex / minStride;
 200
 201             if (sourceIndex == minStride * atomIndex)
 202             {
 203                 *((float*)(&sm_forces[atomIndex]) + dimIndex) =
 204                         (sm_forceTemp[dimIndex][sourceIndex] + sm_forceTemp[dimIndex][sourceIndex + 1]) * n;
 205             }
 206         }
 207     }
 208 }
 209
 210 /*! \brief
 211  * A CUDA kernel which gathers the atom forces from the grid.
 212  * The grid is assumed to be wrapped in dimension Z.
 213  *
 214  * \tparam[in] order                The PME order (must be 4 currently).
 215  * \tparam[in] wrapX                Tells if the grid is wrapped in the X dimension.
 216  * \tparam[in] wrapY                Tells if the grid is wrapped in the Y dimension.
 217  * \tparam[in] readGlobal           Tells if we should read spline values from global memory
 218  * \tparam[in] useOrderThreads      Tells if we should use order threads per atom
 219  *                                   (order*order used if false)
 220  * \param[in]  kernelParams         All the PME GPU data.
 221  */
 222 template<const int order, const bool wrapX, const bool wrapY, const bool readGlobal, const bool useOrderThreads>
 223 __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__
 224         void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams)
 225 {
 226     /* Global memory pointers */
 227     const float* __restrict__ gm_coefficients = kernelParams.atoms.d_coefficients;
 228     const float* __restrict__ gm_grid         = kernelParams.grid.d_realGrid;
 229     float* __restrict__ gm_forces             = kernelParams.atoms.d_forces;
 230
 231     /* Global memory pointers for readGlobal */
 232     const float* __restrict__ gm_theta         = kernelParams.atoms.d_theta;
 233     const float* __restrict__ gm_dtheta        = kernelParams.atoms.d_dtheta;
 234     const int* __restrict__ gm_gridlineIndices = kernelParams.atoms.d_gridlineIndices;
 235
 236     float3 atomX;
 237     float  atomCharge;
 238
 239     /* Some sizes */
 240     const int atomsPerBlock =
 241             useOrderThreads ? (c_gatherMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom)
 242                             : (c_gatherMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom);
 243     const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
 244
 245     /* Number of data components and threads for a single atom */
 246     const int atomDataSize = useOrderThreads ? c_pmeSpreadGatherThreadsPerAtom4ThPerAtom
 247                                              : c_pmeSpreadGatherThreadsPerAtom;
 248     const int atomsPerWarp = useOrderThreads ? c_pmeSpreadGatherAtomsPerWarp4ThPerAtom
 249                                              : c_pmeSpreadGatherAtomsPerWarp;
 250
 251     const int blockSize = atomsPerBlock * atomDataSize;
 252     assert(blockSize == blockDim.x * blockDim.y * blockDim.z);
 253
 254     /* These are the atom indices - for the shared and global memory */
 255     const int atomIndexLocal  = threadIdx.z;
 256     const int atomIndexOffset = blockIndex * atomsPerBlock;
 257     const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
 258
 259     /* Early return for fully empty blocks at the end
 260      * (should only happen for billions of input atoms)
 261      */
 262     if (atomIndexOffset >= kernelParams.atoms.nAtoms)
 263     {
 264         return;
 265     }
 266     // 4 warps per block, 8 atoms per warp *3 *4
 267     const int        splineParamsSize    = atomsPerBlock * DIM * order;
 268     const int        gridlineIndicesSize = atomsPerBlock * DIM;
 269     __shared__ int   sm_gridlineIndices[gridlineIndicesSize];
 270     __shared__ float sm_theta[splineParamsSize];
 271     __shared__ float sm_dtheta[splineParamsSize];
 272
 273     /* Spline Z coordinates */
 274     const int ithz = threadIdx.x;
 275
 276     /* These are the spline contribution indices in shared memory */
 277     const int splineIndex = threadIdx.y * blockDim.x + threadIdx.x;
 278     const int lineIndex   = (threadIdx.z * (blockDim.x * blockDim.y))
 279                           + splineIndex; /* And to all the block's particles */
 280
 281     const int threadLocalId =
 282             (threadIdx.z * (blockDim.x * blockDim.y)) + blockDim.x * threadIdx.y + threadIdx.x;
 283     const int threadLocalIdMax = blockDim.x * blockDim.y * blockDim.z;
 284
 285     if (readGlobal)
 286     {
 287         /* Read splines */
 288         const int localGridlineIndicesIndex = threadLocalId;
 289         const int globalGridlineIndicesIndex = blockIndex * gridlineIndicesSize + localGridlineIndicesIndex;
 290         const int globalCheckIndices         = pme_gpu_check_atom_data_index(
 291                 globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
 292         if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
 293         {
 294             sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
 295             assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
 296         }
 297         /* The loop needed for order threads per atom to make sure we load all data values, as each thread must load multiple values
 298            with order*order threads per atom, it is only required for each thread to load one data value */
 299
 300         const int iMin = 0;
 301         const int iMax = useOrderThreads ? 3 : 1;
 302
 303         for (int i = iMin; i < iMax; i++)
 304         {
 305             int localSplineParamsIndex =
 306                     threadLocalId
 307                     + i * threadLocalIdMax; /* i will always be zero for order*order threads per atom */
 308             int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex;
 309             int globalCheckSplineParams = pme_gpu_check_atom_data_index(
 310                     globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
 311             if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
 312             {
 313                 sm_theta[localSplineParamsIndex]  = gm_theta[globalSplineParamsIndex];
 314                 sm_dtheta[localSplineParamsIndex] = gm_dtheta[globalSplineParamsIndex];
 315                 assert(isfinite(sm_theta[localSplineParamsIndex]));
 316                 assert(isfinite(sm_dtheta[localSplineParamsIndex]));
 317             }
 318         }
 319         __syncthreads();
 320     }
 321     else
 322     {
 323         const float3* __restrict__ gm_coordinates = asFloat3(kernelParams.atoms.d_coordinates);
 324         /* Recaclulate  Splines  */
 325         if (c_useAtomDataPrefetch)
 326         {
 327             // charges
 328             __shared__ float sm_coefficients[atomsPerBlock];
 329             // Coordinates
 330             __shared__ float3 sm_coordinates[atomsPerBlock];
 331             /* Staging coefficients/charges */
 332             pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, gm_coefficients);
 333
 334             /* Staging coordinates */
 335             pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(kernelParams, sm_coordinates, gm_coordinates);
 336             __syncthreads();
 337             atomX      = sm_coordinates[atomIndexLocal];
 338             atomCharge = sm_coefficients[atomIndexLocal];
 339         }
 340         else
 341         {
 342             atomX      = gm_coordinates[atomIndexGlobal];
 343             atomCharge = gm_coefficients[atomIndexGlobal];
 344         }
 345         calculate_splines<order, atomsPerBlock, atomsPerWarp, true, false>(
 346                 kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, sm_dtheta, sm_gridlineIndices);
 347         __syncwarp();
 348     }
 349     float fx = 0.0f;
 350     float fy = 0.0f;
 351     float fz = 0.0f;
 352
 353     const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
 354     const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]);
 355
 356     if (chargeCheck & globalCheck)
 357     {
 358         const int nx  = kernelParams.grid.realGridSize[XX];
 359         const int ny  = kernelParams.grid.realGridSize[YY];
 360         const int nz  = kernelParams.grid.realGridSize[ZZ];
 361         const int pny = kernelParams.grid.realGridSizePadded[YY];
 362         const int pnz = kernelParams.grid.realGridSizePadded[ZZ];
 363
 364         const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
 365         const int warpIndex     = atomIndexLocal / atomsPerWarp;
 366
 367         const int splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
 368         const int splineIndexZ = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
 369         const float2 tdz       = make_float2(sm_theta[splineIndexZ], sm_dtheta[splineIndexZ]);
 370
 371         int       iz     = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
 372         const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];
 373
 374         if (iz >= nz)
 375         {
 376             iz -= nz;
 377         }
 378         int constOffset, iy;
 379
 380         const int ithyMin = useOrderThreads ? 0 : threadIdx.y;
 381         const int ithyMax = useOrderThreads ? order : threadIdx.y + 1;
 382         for (int ithy = ithyMin; ithy < ithyMax; ithy++)
 383         {
 384             const int splineIndexY = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
 385             const float2 tdy       = make_float2(sm_theta[splineIndexY], sm_dtheta[splineIndexY]);
 386
 387             iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
 388             if (wrapY & (iy >= ny))
 389             {
 390                 iy -= ny;
 391             }
 392             constOffset = iy * pnz + iz;
 393
 394 #pragma unroll
 395             for (int ithx = 0; (ithx < order); ithx++)
 396             {
 397                 int ix = ixBase + ithx;
 398                 if (wrapX & (ix >= nx))
 399                 {
 400                     ix -= nx;
 401                 }
 402                 const int gridIndexGlobal = ix * pny * pnz + constOffset;
 403                 assert(gridIndexGlobal >= 0);
 404                 const float gridValue = gm_grid[gridIndexGlobal];
 405                 assert(isfinite(gridValue));
 406                 const int splineIndexX =
 407                         getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
 408                 const float2 tdx  = make_float2(sm_theta[splineIndexX], sm_dtheta[splineIndexX]);
 409                 const float  fxy1 = tdz.x * gridValue;
 410                 const float  fz1  = tdz.y * gridValue;
 411                 fx += tdx.y * tdy.x * fxy1;
 412                 fy += tdx.x * tdy.y * fxy1;
 413                 fz += tdx.x * tdy.x * fz1;
 414             }
 415         }
 416     }
 417
 418     // Reduction of partial force contributions
 419     __shared__ float3 sm_forces[atomsPerBlock];
 420     reduce_atom_forces<order, atomDataSize, blockSize>(sm_forces, atomIndexLocal, splineIndex, lineIndex,
 421                                                        kernelParams.grid.realGridSizeFP, fx, fy, fz);
 422     __syncthreads();
 423
 424     /* Calculating the final forces with no component branching, atomsPerBlock threads */
 425     const int forceIndexLocal  = threadLocalId;
 426     const int forceIndexGlobal = atomIndexOffset + forceIndexLocal;
 427     const int calcIndexCheck = pme_gpu_check_atom_data_index(forceIndexGlobal, kernelParams.atoms.nAtoms);
 428     if ((forceIndexLocal < atomsPerBlock) & calcIndexCheck)
 429     {
 430         const float3 atomForces     = sm_forces[forceIndexLocal];
 431         const float  negCoefficient = -gm_coefficients[forceIndexGlobal];
 432         float3       result;
 433         result.x = negCoefficient * kernelParams.current.recipBox[XX][XX] * atomForces.x;
 434         result.y = negCoefficient
 435                    * (kernelParams.current.recipBox[XX][YY] * atomForces.x
 436                       + kernelParams.current.recipBox[YY][YY] * atomForces.y);
 437         result.z = negCoefficient
 438                    * (kernelParams.current.recipBox[XX][ZZ] * atomForces.x
 439                       + kernelParams.current.recipBox[YY][ZZ] * atomForces.y
 440                       + kernelParams.current.recipBox[ZZ][ZZ] * atomForces.z);
 441         sm_forces[forceIndexLocal] = result;
 442     }
 443
 444     __syncwarp();
 445     assert(atomsPerBlock <= warp_size);
 446
 447     /* Writing or adding the final forces component-wise, single warp */
 448     const int blockForcesSize = atomsPerBlock * DIM;
 449     const int numIter         = (blockForcesSize + warp_size - 1) / warp_size;
 450     const int iterThreads     = blockForcesSize / numIter;
 451     if (threadLocalId < iterThreads)
 452     {
 453 #pragma unroll
 454         for (int i = 0; i < numIter; i++)
 455         {
 456             int       outputIndexLocal  = i * iterThreads + threadLocalId;
 457             int       outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal;
 458             const int globalOutputCheck =
 459                     pme_gpu_check_atom_data_index(outputIndexGlobal, kernelParams.atoms.nAtoms * DIM);
 460             if (globalOutputCheck)
 461             {
 462                 const float outputForceComponent = ((float*)sm_forces)[outputIndexLocal];
 463                 gm_forces[outputIndexGlobal]     = outputForceComponent;
 464             }
 465         }
 466     }
 467 }
 468
 469 //! Kernel instantiations
 470 template __global__ void pme_gather_kernel<4, true, true, true, true>(const PmeGpuCudaKernelParams);
 471 template __global__ void pme_gather_kernel<4, true, true, true, false>(const PmeGpuCudaKernelParams);
 472 template __global__ void pme_gather_kernel<4, true, true, false, true>(const PmeGpuCudaKernelParams);
 473 template __global__ void pme_gather_kernel<4, true, true, false, false>(const PmeGpuCudaKernelParams);