src/gromacs/ewald/pme_spread.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013-2016,2017,2018,2019,2020, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37
  38 /*! \internal \file
  39  *  \brief Implements PME GPU spline calculation and charge spreading in CUDA.
  40  *  TODO: consider always pre-sorting particles (as in DD case).
  41  *
  42  *  \author Aleksei Iupinov <a.yupinov@gmail.com>
  43  */
  44
  45 #include "gmxpre.h"
  46
  47 #include <cassert>
  48
  49 #include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
  50 #include "gromacs/gpu_utils/typecasts.cuh"
  51
  52 #include "pme.cuh"
  53 #include "pme_gpu_calculate_splines.cuh"
  54 #include "pme_grid.h"
  55
  56 /*! \brief
  57  * Charge spreading onto the grid.
  58  * This corresponds to the CPU function spread_coefficients_bsplines_thread().
  59  * Optional second stage of the spline_and_spread_kernel.
  60  *
  61  * \tparam[in] order                PME interpolation order.
  62  * \tparam[in] wrapX                Whether the grid overlap in dimension X should be wrapped.
  63  * \tparam[in] wrapY                Whether the grid overlap in dimension Y should be wrapped.
  64  * \tparam[in] useOrderThreads      Whether we should use order threads per atom (order*order used if false).
  65  *
  66  * \param[in]  kernelParams         Input PME CUDA data in constant memory.
  67  * \param[in]  atomCharge           Atom charge/coefficient of atom processed by thread.
  68  * \param[in]  sm_gridlineIndices   Atom gridline indices in the shared memory.
  69  * \param[in]  sm_theta             Atom spline values in the shared memory.
  70  */
  71 template<const int order, const bool wrapX, const bool wrapY, const bool useOrderThreads>
  72 __device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams kernelParams,
  73                                                const float*                 atomCharge,
  74                                                const int* __restrict__ sm_gridlineIndices,
  75                                                const float* __restrict__ sm_theta)
  76 {
  77     /* Global memory pointer to the output grid */
  78     float* __restrict__ gm_grid = kernelParams.grid.d_realGrid;
  79
  80
  81     const int atomsPerWarp = useOrderThreads ? c_pmeSpreadGatherAtomsPerWarp4ThPerAtom
  82                                              : c_pmeSpreadGatherAtomsPerWarp;
  83
  84     const int nx  = kernelParams.grid.realGridSize[XX];
  85     const int ny  = kernelParams.grid.realGridSize[YY];
  86     const int nz  = kernelParams.grid.realGridSize[ZZ];
  87     const int pny = kernelParams.grid.realGridSizePadded[YY];
  88     const int pnz = kernelParams.grid.realGridSizePadded[ZZ];
  89
  90     const int offx = 0, offy = 0, offz = 0; // unused for now
  91
  92     const int atomIndexLocal = threadIdx.z;
  93
  94     const int chargeCheck = pme_gpu_check_atom_charge(*atomCharge);
  95     if (chargeCheck)
  96     {
  97         // Spline Z coordinates
  98         const int ithz = threadIdx.x;
  99
 100         const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX] - offx;
 101         const int iyBase = sm_gridlineIndices[atomIndexLocal * DIM + YY] - offy;
 102         int       iz     = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] - offz + ithz;
 103         if (iz >= nz)
 104         {
 105             iz -= nz;
 106         }
 107         /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
 108         const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
 109         /* Warp index w.r.t. block - could probably be obtained easier? */
 110         const int warpIndex = atomIndexLocal / atomsPerWarp;
 111
 112         const int splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
 113         const int splineIndexZ = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
 114         const float thetaZ     = sm_theta[splineIndexZ];
 115
 116         /* loop not used if order*order threads per atom */
 117         const int ithyMin = useOrderThreads ? 0 : threadIdx.y;
 118         const int ithyMax = useOrderThreads ? order : threadIdx.y + 1;
 119         for (int ithy = ithyMin; ithy < ithyMax; ithy++)
 120         {
 121             int iy = iyBase + ithy;
 122             if (wrapY & (iy >= ny))
 123             {
 124                 iy -= ny;
 125             }
 126
 127             const int splineIndexY = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
 128             float       thetaY = sm_theta[splineIndexY];
 129             const float Val    = thetaZ * thetaY * (*atomCharge);
 130             assert(isfinite(Val));
 131             const int offset = iy * pnz + iz;
 132
 133 #pragma unroll
 134             for (int ithx = 0; (ithx < order); ithx++)
 135             {
 136                 int ix = ixBase + ithx;
 137                 if (wrapX & (ix >= nx))
 138                 {
 139                     ix -= nx;
 140                 }
 141                 const int gridIndexGlobal = ix * pny * pnz + offset;
 142                 const int splineIndexX =
 143                         getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
 144                 const float thetaX = sm_theta[splineIndexX];
 145                 assert(isfinite(thetaX));
 146                 assert(isfinite(gm_grid[gridIndexGlobal]));
 147                 atomicAdd(gm_grid + gridIndexGlobal, thetaX * Val);
 148             }
 149         }
 150     }
 151 }
 152
 153 /*! \brief
 154  * A spline computation and charge spreading kernel function.
 155  *
 156  * Two tuning parameters can be used for additional performance. For small systems and for debugging
 157  * writeGlobal should be used removing the need to recalculate the theta values in the gather kernel.
 158  * Similarly for useOrderThreads large systems order threads per atom gives higher performance than order*order threads
 159  *
 160  * \tparam[in] order                PME interpolation order.
 161  * \tparam[in] computeSplines       A boolean which tells if the spline parameter and
 162  *                                  gridline indices' computation should be performed.
 163  * \tparam[in] spreadCharges        A boolean which tells if the charge spreading should be performed.
 164  * \tparam[in] wrapX                A boolean which tells if the grid overlap in dimension X should be wrapped.
 165  * \tparam[in] wrapY                A boolean which tells if the grid overlap in dimension Y should be wrapped.
 166  * \tparam[in] writeGlobal          A boolean which tells if the theta values and gridlines should be written to global memory.
 167  * \tparam[in] useOrderThreads         A boolean which tells if we should use order threads per atom (order*order used if false).
 168  * \param[in]  kernelParams         Input PME CUDA data in constant memory.
 169  */
 170 template<const int order, const bool computeSplines, const bool spreadCharges, const bool wrapX, const bool wrapY, const bool writeGlobal, const bool useOrderThreads>
 171 __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUTE __global__
 172         void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams)
 173 {
 174     const int atomsPerBlock =
 175             useOrderThreads ? c_spreadMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom
 176                             : c_spreadMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom;
 177     // Gridline indices, ivec
 178     __shared__ int sm_gridlineIndices[atomsPerBlock * DIM];
 179     // Spline values
 180     __shared__ float sm_theta[atomsPerBlock * DIM * order];
 181     float            dtheta;
 182
 183     const int atomsPerWarp = useOrderThreads ? c_pmeSpreadGatherAtomsPerWarp4ThPerAtom
 184                                              : c_pmeSpreadGatherAtomsPerWarp;
 185
 186     float3 atomX;
 187     float  atomCharge;
 188
 189     const int blockIndex      = blockIdx.y * gridDim.x + blockIdx.x;
 190     const int atomIndexOffset = blockIndex * atomsPerBlock;
 191
 192     /* Thread index w.r.t. block */
 193     const int threadLocalId =
 194             (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
 195     /* Warp index w.r.t. block - could probably be obtained easier? */
 196     const int warpIndex = threadLocalId / warp_size;
 197
 198     /* Atom index w.r.t. warp */
 199     const int atomWarpIndex = threadIdx.z % atomsPerWarp;
 200     /* Atom index w.r.t. block/shared memory */
 201     const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
 202     /* Atom index w.r.t. global memory */
 203     const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
 204
 205     /* Early return for fully empty blocks at the end
 206      * (should only happen for billions of input atoms)
 207      */
 208     if (atomIndexOffset >= kernelParams.atoms.nAtoms)
 209     {
 210         return;
 211     }
 212     /* Charges, required for both spline and spread */
 213     if (c_useAtomDataPrefetch)
 214     {
 215         __shared__ float sm_coefficients[atomsPerBlock];
 216         pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients, kernelParams.atoms.d_coefficients);
 217         __syncthreads();
 218         atomCharge = sm_coefficients[atomIndexLocal];
 219     }
 220     else
 221     {
 222         atomCharge = kernelParams.atoms.d_coefficients[atomIndexGlobal];
 223     }
 224
 225     if (computeSplines)
 226     {
 227         const float3* __restrict__ gm_coordinates = asFloat3(kernelParams.atoms.d_coordinates);
 228         if (c_useAtomDataPrefetch)
 229         {
 230             // Coordinates
 231             __shared__ float3 sm_coordinates[atomsPerBlock];
 232
 233             /* Staging coordinates */
 234             pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(sm_coordinates, gm_coordinates);
 235             __syncthreads();
 236             atomX = sm_coordinates[atomIndexLocal];
 237         }
 238         else
 239         {
 240             atomX = gm_coordinates[atomIndexGlobal];
 241         }
 242         calculate_splines<order, atomsPerBlock, atomsPerWarp, false, writeGlobal>(
 243                 kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, &dtheta, sm_gridlineIndices);
 244         __syncwarp();
 245     }
 246     else
 247     {
 248         /* Staging the data for spread
 249          * (the data is assumed to be in GPU global memory with proper layout already,
 250          * as in after running the spline kernel)
 251          */
 252         /* Spline data - only thetas (dthetas will only be needed in gather) */
 253         pme_gpu_stage_atom_data<float, atomsPerBlock, DIM * order>(sm_theta, kernelParams.atoms.d_theta);
 254         /* Gridline indices */
 255         pme_gpu_stage_atom_data<int, atomsPerBlock, DIM>(sm_gridlineIndices,
 256                                                          kernelParams.atoms.d_gridlineIndices);
 257
 258         __syncthreads();
 259     }
 260
 261     /* Spreading */
 262     if (spreadCharges)
 263     {
 264         spread_charges<order, wrapX, wrapY, useOrderThreads>(kernelParams, &atomCharge,
 265                                                              sm_gridlineIndices, sm_theta);
 266     }
 267 }
 268
 269 //! Kernel instantiations
 270 template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, true, true>(const PmeGpuCudaKernelParams);
 271 template __global__ void
 272 pme_spline_and_spread_kernel<4, true, false, true, true, true, true>(const PmeGpuCudaKernelParams);
 273 template __global__ void
 274 pme_spline_and_spread_kernel<4, false, true, true, true, true, true>(const PmeGpuCudaKernelParams);
 275
 276 template __global__ void
 277 pme_spline_and_spread_kernel<4, true, true, true, true, false, true>(const PmeGpuCudaKernelParams);
 278
 279 template __global__ void
 280 pme_spline_and_spread_kernel<4, true, true, true, true, true, false>(const PmeGpuCudaKernelParams);
 281 template __global__ void
 282 pme_spline_and_spread_kernel<4, true, false, true, true, true, false>(const PmeGpuCudaKernelParams);
 283 template __global__ void
 284 pme_spline_and_spread_kernel<4, false, true, true, true, true, false>(const PmeGpuCudaKernelParams);
 285
 286 template __global__ void
 287 pme_spline_and_spread_kernel<4, true, true, true, true, false, false>(const PmeGpuCudaKernelParams);