src/gromacs/ewald/pme_spread.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013-2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37
  38 /*! \internal \file
  39  *  \brief Implements PME GPU spline calculation and charge spreading in CUDA.
  40  *  TODO: consider always pre-sorting particles (as in DD case).
  41  *
  42  *  \author Aleksei Iupinov <a.yupinov@gmail.com>
  43  */
  44
  45 #include "gmxpre.h"
  46
  47 #include <cassert>
  48
  49 #include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
  50 #include "gromacs/gpu_utils/typecasts.cuh"
  51
  52 #include "pme.cuh"
  53 #include "pme_gpu_calculate_splines.cuh"
  54 #include "pme_grid.h"
  55
  56 /*
  57  * This define affects the spline calculation behaviour in the kernel.
  58  * 0: a single GPU thread handles a single dimension of a single particle (calculating and storing
  59  * (order) spline values and derivatives). 1: (order) threads do redundant work on this same task,
  60  * each one stores only a single theta and single dtheta into global arrays. The only efficiency
  61  * difference is less global store operations, countered by more redundant spline computation.
  62  *
  63  * TODO: estimate if this should be a boolean parameter (and add it to the unit test if so).
  64  */
  65 #define PME_GPU_PARALLEL_SPLINE 0
  66
  67 /*! \brief
  68  * Charge spreading onto the grid.
  69  * This corresponds to the CPU function spread_coefficients_bsplines_thread().
  70  * Optional second stage of the spline_and_spread_kernel.
  71  *
  72  * \tparam     order                PME interpolation order.
  73  * \tparam     wrapX                Whether the grid overlap in dimension X should be wrapped.
  74  * \tparam     wrapY                Whether the grid overlap in dimension Y should be wrapped.
  75  * \tparam     gridIndex            The index of the grid to use in the kernel.
  76  * \tparam     threadsPerAtom       How many threads work on each atom
  77  *
  78  * \param[in]  kernelParams         Input PME CUDA data in constant memory.
  79  * \param[in]  atomCharge           Atom charge/coefficient of atom processed by thread.
  80  * \param[in]  sm_gridlineIndices   Atom gridline indices in the shared memory.
  81  * \param[in]  sm_theta             Atom spline values in the shared memory.
  82  */
  83 template<int order, bool wrapX, bool wrapY, int gridIndex, ThreadsPerAtom threadsPerAtom>
  84 __device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams kernelParams,
  85                                                const float*                 atomCharge,
  86                                                const int* __restrict__ sm_gridlineIndices,
  87                                                const float* __restrict__ sm_theta)
  88 {
  89     /* Global memory pointer to the output grid */
  90     float* __restrict__ gm_grid = kernelParams.grid.d_realGrid[gridIndex];
  91
  92     // Number of atoms processed by a single warp in spread and gather
  93     const int threadsPerAtomValue = (threadsPerAtom == ThreadsPerAtom::Order) ? order : order * order;
  94     const int atomsPerWarp        = warp_size / threadsPerAtomValue;
  95
  96     const int nx  = kernelParams.grid.realGridSize[XX];
  97     const int ny  = kernelParams.grid.realGridSize[YY];
  98     const int nz  = kernelParams.grid.realGridSize[ZZ];
  99     const int pny = kernelParams.grid.realGridSizePadded[YY];
 100     const int pnz = kernelParams.grid.realGridSizePadded[ZZ];
 101
 102     const int offx = 0, offy = 0, offz = 0; // unused for now
 103
 104     const int atomIndexLocal = threadIdx.z;
 105
 106     const int chargeCheck = pme_gpu_check_atom_charge(*atomCharge);
 107     if (chargeCheck)
 108     {
 109         // Spline Z coordinates
 110         const int ithz = threadIdx.x;
 111
 112         const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX] - offx;
 113         const int iyBase = sm_gridlineIndices[atomIndexLocal * DIM + YY] - offy;
 114         int       iz     = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] - offz + ithz;
 115         if (iz >= nz)
 116         {
 117             iz -= nz;
 118         }
 119         /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
 120         const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
 121         /* Warp index w.r.t. block - could probably be obtained easier? */
 122         const int warpIndex = atomIndexLocal / atomsPerWarp;
 123
 124         const int splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
 125         const int splineIndexZ = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
 126         const float thetaZ     = sm_theta[splineIndexZ];
 127
 128         /* loop not used if order*order threads per atom */
 129         const int ithyMin = (threadsPerAtom == ThreadsPerAtom::Order) ? 0 : threadIdx.y;
 130         const int ithyMax = (threadsPerAtom == ThreadsPerAtom::Order) ? order : threadIdx.y + 1;
 131         for (int ithy = ithyMin; ithy < ithyMax; ithy++)
 132         {
 133             int iy = iyBase + ithy;
 134             if (wrapY & (iy >= ny))
 135             {
 136                 iy -= ny;
 137             }
 138
 139             const int splineIndexY = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
 140             float       thetaY = sm_theta[splineIndexY];
 141             const float Val    = thetaZ * thetaY * (*atomCharge);
 142             assert(isfinite(Val));
 143             const int offset = iy * pnz + iz;
 144
 145 #pragma unroll
 146             for (int ithx = 0; (ithx < order); ithx++)
 147             {
 148                 int ix = ixBase + ithx;
 149                 if (wrapX & (ix >= nx))
 150                 {
 151                     ix -= nx;
 152                 }
 153                 const int gridIndexGlobal = ix * pny * pnz + offset;
 154                 const int splineIndexX =
 155                         getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
 156                 const float thetaX = sm_theta[splineIndexX];
 157                 assert(isfinite(thetaX));
 158                 assert(isfinite(gm_grid[gridIndexGlobal]));
 159                 atomicAdd(gm_grid + gridIndexGlobal, thetaX * Val);
 160             }
 161         }
 162     }
 163 }
 164
 165 /*! \brief
 166  * A spline computation and charge spreading kernel function.
 167  *
 168  * Two tuning parameters can be used for additional performance. For small systems and for debugging
 169  * writeGlobal should be used removing the need to recalculate the theta values in the gather kernel.
 170  * Similarly for useOrderThreads large systems order threads per atom gives higher performance than order*order threads
 171  *
 172  * \tparam     order                PME interpolation order.
 173  * \tparam     computeSplines       A boolean which tells if the spline parameter and
 174  *                                  gridline indices' computation should be performed.
 175  * \tparam     spreadCharges        A boolean which tells if the charge spreading should be performed.
 176  * \tparam     wrapX                A boolean which tells if the grid overlap in dimension X should be wrapped.
 177  * \tparam     wrapY                A boolean which tells if the grid overlap in dimension Y should be wrapped.
 178  * \tparam     numGrids             The number of grids to use in the kernel. Can be 1 or 2.
 179  * \tparam     writeGlobal          A boolean which tells if the theta values and gridlines should be written to global memory.
 180  * \tparam     threadsPerAtom       How many threads work on each atom
 181  * \param[in]  kernelParams         Input PME CUDA data in constant memory.
 182  */
 183 template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, int numGrids, bool writeGlobal, ThreadsPerAtom threadsPerAtom>
 184 __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUTE __global__
 185         void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams)
 186 {
 187     const int threadsPerAtomValue = (threadsPerAtom == ThreadsPerAtom::Order) ? order : order * order;
 188     const int atomsPerBlock       = c_spreadMaxThreadsPerBlock / threadsPerAtomValue;
 189     // Number of atoms processed by a single warp in spread and gather
 190     const int atomsPerWarp = warp_size / threadsPerAtomValue;
 191     // Gridline indices, ivec
 192     __shared__ int sm_gridlineIndices[atomsPerBlock * DIM];
 193     // Charges
 194     __shared__ float sm_coefficients[atomsPerBlock];
 195     // Spline values
 196     __shared__ float sm_theta[atomsPerBlock * DIM * order];
 197     float            dtheta;
 198
 199     float3 atomX;
 200     float  atomCharge;
 201
 202     const int blockIndex      = blockIdx.y * gridDim.x + blockIdx.x;
 203     const int atomIndexOffset = blockIndex * atomsPerBlock + kernelParams.pipelineAtomStart;
 204
 205     /* Thread index w.r.t. block */
 206     const int threadLocalId =
 207             (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
 208     /* Warp index w.r.t. block - could probably be obtained easier? */
 209     const int warpIndex = threadLocalId / warp_size;
 210
 211     /* Atom index w.r.t. warp */
 212     const int atomWarpIndex = threadIdx.z % atomsPerWarp;
 213     /* Atom index w.r.t. block/shared memory */
 214     const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
 215     /* Atom index w.r.t. global memory */
 216     const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
 217
 218     /* Early return for fully empty blocks at the end
 219      * (should only happen for billions of input atoms)
 220      */
 221     if (atomIndexOffset >= kernelParams.atoms.nAtoms)
 222     {
 223         return;
 224     }
 225     /* Charges, required for both spline and spread */
 226     if (c_useAtomDataPrefetch)
 227     {
 228         pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(
 229                 sm_coefficients, &kernelParams.atoms.d_coefficients[0][kernelParams.pipelineAtomStart]);
 230         __syncthreads();
 231         atomCharge = sm_coefficients[atomIndexLocal];
 232     }
 233     else
 234     {
 235         atomCharge = kernelParams.atoms.d_coefficients[0][atomIndexGlobal];
 236     }
 237
 238     if (computeSplines)
 239     {
 240         const float3* __restrict__ gm_coordinates =
 241                 asFloat3(&kernelParams.atoms.d_coordinates[kernelParams.pipelineAtomStart]);
 242         if (c_useAtomDataPrefetch)
 243         {
 244             // Coordinates
 245             __shared__ float3 sm_coordinates[atomsPerBlock];
 246
 247             /* Staging coordinates */
 248             pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(sm_coordinates, gm_coordinates);
 249             __syncthreads();
 250             atomX = sm_coordinates[atomIndexLocal];
 251         }
 252         else
 253         {
 254             atomX = gm_coordinates[atomIndexGlobal];
 255         }
 256         calculate_splines<order, atomsPerBlock, atomsPerWarp, false, writeGlobal, numGrids>(
 257                 kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, &dtheta, sm_gridlineIndices);
 258         __syncwarp();
 259     }
 260     else
 261     {
 262         /* Staging the data for spread
 263          * (the data is assumed to be in GPU global memory with proper layout already,
 264          * as in after running the spline kernel)
 265          */
 266         /* Spline data - only thetas (dthetas will only be needed in gather) */
 267         pme_gpu_stage_atom_data<float, atomsPerBlock, DIM * order>(sm_theta, kernelParams.atoms.d_theta);
 268         /* Gridline indices */
 269         pme_gpu_stage_atom_data<int, atomsPerBlock, DIM>(sm_gridlineIndices,
 270                                                          kernelParams.atoms.d_gridlineIndices);
 271
 272         __syncthreads();
 273     }
 274
 275     /* Spreading */
 276     if (spreadCharges)
 277     {
 278
 279         if (!kernelParams.usePipeline || (atomIndexGlobal < kernelParams.pipelineAtomEnd))
 280         {
 281             spread_charges<order, wrapX, wrapY, 0, threadsPerAtom>(
 282                     kernelParams, &atomCharge, sm_gridlineIndices, sm_theta);
 283         }
 284     }
 285     if (numGrids == 2)
 286     {
 287         __syncthreads();
 288         if (c_useAtomDataPrefetch)
 289         {
 290             pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients,
 291                                                              kernelParams.atoms.d_coefficients[1]);
 292             __syncthreads();
 293             atomCharge = sm_coefficients[atomIndexLocal];
 294         }
 295         else
 296         {
 297             atomCharge = kernelParams.atoms.d_coefficients[1][atomIndexGlobal];
 298         }
 299         if (spreadCharges)
 300         {
 301             if (!kernelParams.usePipeline || (atomIndexGlobal < kernelParams.pipelineAtomEnd))
 302             {
 303                 spread_charges<order, wrapX, wrapY, 1, threadsPerAtom>(
 304                         kernelParams, &atomCharge, sm_gridlineIndices, sm_theta);
 305             }
 306         }
 307     }
 308 }
 309
 310 //! Kernel instantiations
 311 // clang-format off
 312 template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 1, true, ThreadsPerAtom::Order>        (const PmeGpuCudaKernelParams);
 313 template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, 1, true, ThreadsPerAtom::Order>       (const PmeGpuCudaKernelParams);
 314 template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, 1, true, ThreadsPerAtom::Order>       (const PmeGpuCudaKernelParams);
 315 template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 1, false, ThreadsPerAtom::Order>       (const PmeGpuCudaKernelParams);
 316 template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 1, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
 317 template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, 1, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
 318 template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, 1, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
 319 template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 1, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
 320 template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 2, true, ThreadsPerAtom::Order>        (const PmeGpuCudaKernelParams);
 321 template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, 2, true, ThreadsPerAtom::Order>       (const PmeGpuCudaKernelParams);
 322 template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, 2, true, ThreadsPerAtom::Order>       (const PmeGpuCudaKernelParams);
 323 template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 2, false, ThreadsPerAtom::Order>       (const PmeGpuCudaKernelParams);
 324 template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 2, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
 325 template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, 2, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
 326 template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, 2, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
 327 template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 2, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
 328 // clang-format on