src/gromacs/ewald/pme_gather.clh

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *  \brief Implements PME OpenCL force gathering kernel.
  38  * When including this and other PME OpenCL kernel files, plenty of common
  39  * constants/macros are expected to be defined (such as "order" which is PME interpolation order).
  40  * For details, please see how pme_program.cl is compiled in pme_gpu_program_impl_ocl.cpp.
  41  *
  42  * This file's kernels specifically expect the following definitions:
  43  *
  44  * - atomsPerBlock which expresses how many atoms are processed by a single work group
  45  * - order which is a PME interpolation order
  46  * - overwriteForces must evaluate to either true or false to specify whether the kernel
  47  * overwrites or reduces into the forces buffer
  48  * - wrapX and wrapY must evaluate to either true or false to specify whether the grid overlap
  49  * in dimension X/Y is to be used
  50  *
  51  *  \author Aleksei Iupinov <a.yupinov@gmail.com>
  52  */
  53
  54 #include "pme_gpu_types.h"
  55 #include "pme_gpu_utils.clh"
  56
  57 #ifndef COMPILE_GATHER_HELPERS_ONCE
  58 #define COMPILE_GATHER_HELPERS_ONCE
  59
  60 /*! \brief
  61  * Unrolls the dynamic index accesses to the constant grid sizes to avoid local memory operations.
  62  */
  63 inline float read_grid_size(const float *realGridSizeFP,
  64                             const int    dimIndex)
  65 {
  66     switch (dimIndex)
  67     {
  68         case XX: return realGridSizeFP[XX];
  69         case YY: return realGridSizeFP[YY];
  70         case ZZ: return realGridSizeFP[ZZ];
  71     }
  72     assert(false);
  73     return 0.0f;
  74 }
  75
  76 /*! \brief Reduce the partial force contributions.
  77  *
  78  * FIXME: this reduction should be simplified and improved, it does 3x16 force component
  79  *        reduction per 16 threads so no extra shared mem should be needed for intermediates
  80  *        or passing results back.
  81  *
  82  * \param[out]      sm_forces          Local memory array with the output forces (rvec).
  83  * \param[in]       atomIndexLocal     Local atom index
  84  * \param[in]       splineIndex        Spline index
  85  * \param[in]       lineIndex          Line index (same as threadLocalId)
  86  * \param[in]       realGridSizeFP     Local grid size constant
  87  * \param[in]       fx                 Input force partial component X
  88  * \param[in]       fy                 Input force partial component Y
  89  * \param[in]       fz                 Input force partial component Z
  90  * \param[in,out]   sm_forceReduction  Reduction working buffer
  91  * \param[in]       sm_forceTemp       Convenience pointers into \p sm_forceReduction
  92  */
  93 inline void reduce_atom_forces(__local float * __restrict__  sm_forces,
  94                                const int                     atomIndexLocal,
  95                                const int                     splineIndex,
  96                                const int                     lineIndex,
  97                                const float                  *realGridSizeFP,
  98                                float                         fx,
  99                                float                         fy,
 100                                float                         fz,
 101                                __local float * __restrict__  sm_forceReduction,
 102                                __local float ** __restrict__ sm_forceTemp
 103                                )
 104
 105 {
 106     // TODO: implement AMD intrinsics reduction, like with shuffles in CUDA version. #2514
 107
 108     /* Number of data components and threads for a single atom */
 109 #define atomDataSize threadsPerAtom
 110     // We use blockSize local memory elements to read fx, or fy, or fz, and then reduce them to fit into smemPerDim elements
 111     // All those guys are defines and not consts, because they go into the local memory array size.
 112 #define blockSize (atomsPerBlock * atomDataSize)
 113 #define smemPerDim warp_size
 114 #define smemReserved  (DIM * smemPerDim)
 115
 116     const int numWarps  = blockSize / smemPerDim;
 117     const int minStride = max(1, atomDataSize / numWarps);
 118
 119 #pragma unroll DIM
 120     for (int dimIndex = 0; dimIndex < DIM; dimIndex++)
 121     {
 122         int elementIndex = smemReserved + lineIndex;
 123         // Store input force contributions
 124         sm_forceReduction[elementIndex] = (dimIndex == XX) ? fx : (dimIndex == YY) ? fy : fz;
 125
 126 #if !defined(_AMD_SOURCE_)
 127         /* This barrier was not needed in CUDA, nor is it needed on AMD GPUs.
 128          * Different OpenCL compilers might have different ideas
 129          * about #pragma unroll, though. OpenCL 2 has _attribute__((opencl_unroll_hint)).
 130          * #2519
 131          */
 132         barrier(CLK_LOCAL_MEM_FENCE);
 133 #endif
 134
 135         // Reduce to fit into smemPerDim (warp size)
 136 #pragma unroll
 137         for (int redStride = atomDataSize >> 1; redStride > minStride; redStride >>= 1)
 138         {
 139             if (splineIndex < redStride)
 140             {
 141                 sm_forceReduction[elementIndex] += sm_forceReduction[elementIndex + redStride];
 142             }
 143         }
 144         barrier(CLK_LOCAL_MEM_FENCE);
 145         // Last iteration - packing everything to be nearby, storing convenience pointer
 146         sm_forceTemp[dimIndex] = sm_forceReduction + dimIndex * smemPerDim;
 147         int redStride = minStride;
 148         if (splineIndex < redStride)
 149         {
 150             const int packedIndex = atomIndexLocal * redStride + splineIndex;
 151             sm_forceTemp[dimIndex][packedIndex] = sm_forceReduction[elementIndex] + sm_forceReduction[elementIndex + redStride];
 152         }
 153     }
 154
 155     barrier(CLK_LOCAL_MEM_FENCE);
 156
 157     assert ((blockSize / warp_size) >= DIM);
 158
 159     const int warpIndex = lineIndex / warp_size;
 160     const int dimIndex  = warpIndex;
 161
 162     // First 3 warps can now process 1 dimension each
 163     if (dimIndex < DIM)
 164     {
 165         int sourceIndex = lineIndex % warp_size;
 166 #pragma unroll
 167         for (int redStride = minStride >> 1; redStride > 1; redStride >>= 1)
 168         {
 169             if (!(splineIndex & redStride))
 170             {
 171                 sm_forceTemp[dimIndex][sourceIndex] += sm_forceTemp[dimIndex][sourceIndex + redStride];
 172             }
 173         }
 174
 175         const float n = read_grid_size(realGridSizeFP, dimIndex);
 176
 177         const int   atomIndex = sourceIndex / minStride;
 178         if (sourceIndex == minStride * atomIndex)
 179         {
 180             sm_forces[atomIndex * DIM + dimIndex] = (sm_forceTemp[dimIndex][sourceIndex] + sm_forceTemp[dimIndex][sourceIndex + 1]) * n;
 181         }
 182     }
 183 }
 184
 185 #endif //COMPILE_GATHER_HELPERS_ONCE
 186
 187 /*! \brief
 188  * An OpenCL kernel which gathers the atom forces from the grid.
 189  * The grid is assumed to be wrapped in dimension Z.
 190  * Please see the file description for additional defines which this kernel expects.
 191  *
 192  * \param[in]     kernelParams         All the PME GPU data.
 193  * \param[in]     gm_coefficients      Atom charges/coefficients.
 194  * \param[in]     gm_grid              Global 3D grid.
 195  * \param[in]     gm_theta             Atom spline parameter values
 196  * \param[in]     gm_dtheta            Atom spline parameter derivatives
 197  * \param[in]     gm_gridlineIndices   Atom gridline indices (ivec)
 198  * \param[in,out] gm_forces            Atom forces (rvec)
 199  */
 200 __attribute__((reqd_work_group_size(order, order, atomsPerBlock)))
 201 __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKernelParams   kernelParams,
 202                                                         __global const float * __restrict__  gm_coefficients,
 203                                                         __global const float * __restrict__  gm_grid,
 204                                                         __global const float * __restrict__  gm_theta,
 205                                                         __global const float * __restrict__  gm_dtheta,
 206                                                         __global const int * __restrict__    gm_gridlineIndices,
 207                                                         __global float * __restrict__        gm_forces
 208                                                         )
 209 {
 210     /* These are the atom indices - for the shared and global memory */
 211     const int         atomIndexLocal    = get_local_id(ZZ);
 212     const int         atomIndexOffset   = get_group_id(XX) * atomsPerBlock;
 213     const int         atomIndexGlobal   = atomIndexOffset + atomIndexLocal;
 214
 215     /* Some sizes which are defines and not consts because they go into the array size */
 216     #define blockSize (atomsPerBlock * atomDataSize)
 217     assert(blockSize == (get_local_size(0) * get_local_size(1) * get_local_size(2)));
 218     #define smemPerDim warp_size
 219     #define smemReserved  (DIM * smemPerDim)
 220     #define totalSharedMemory (smemReserved + blockSize)
 221     #define gridlineIndicesSize (atomsPerBlock * DIM)
 222     #define splineParamsSize (atomsPerBlock * DIM * order)
 223
 224     __local int    sm_gridlineIndices[gridlineIndicesSize];
 225     __local float2 sm_splineParams[splineParamsSize]; /* Theta/dtheta pairs  as .x/.y */
 226
 227     /* Spline Y/Z coordinates */
 228     const int ithy = get_local_id(YY);
 229     const int ithz = get_local_id(XX);
 230
 231     const int threadLocalId = (get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0);
 232
 233     /* These are the spline contribution indices in shared memory */
 234     const int splineIndex = (get_local_id(1) * get_local_size(0) + get_local_id(0)); /* Relative to the current particle , 0..15 for order 4 */
 235     const int lineIndex   = threadLocalId;                                           /* And to all the block's particles */
 236
 237     /* Staging the atom gridline indices, DIM * atomsPerBlock threads */
 238     const int localGridlineIndicesIndex  = threadLocalId;
 239     const int globalGridlineIndicesIndex = get_group_id(XX) * gridlineIndicesSize + localGridlineIndicesIndex;
 240     const int globalCheckIndices         = pme_gpu_check_atom_data_index(globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
 241     if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
 242     {
 243         sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
 244         assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
 245     }
 246     /* Staging the spline parameters, DIM * order * atomsPerBlock threads */
 247     const int localSplineParamsIndex  = threadLocalId;
 248     const int globalSplineParamsIndex = get_group_id(XX) * splineParamsSize + localSplineParamsIndex;
 249     const int globalCheckSplineParams = pme_gpu_check_atom_data_index(globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
 250     if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
 251     {
 252         sm_splineParams[localSplineParamsIndex].x = gm_theta[globalSplineParamsIndex];
 253         sm_splineParams[localSplineParamsIndex].y = gm_dtheta[globalSplineParamsIndex];
 254         assert(isfinite(sm_splineParams[localSplineParamsIndex].x));
 255         assert(isfinite(sm_splineParams[localSplineParamsIndex].y));
 256     }
 257     barrier(CLK_LOCAL_MEM_FENCE);
 258
 259     float           fx = 0.0f;
 260     float           fy = 0.0f;
 261     float           fz = 0.0f;
 262
 263     const int       globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
 264     const int       chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]);
 265
 266     if (chargeCheck & globalCheck)
 267     {
 268         const int    nx        = kernelParams.grid.realGridSize[XX];
 269         const int    ny        = kernelParams.grid.realGridSize[YY];
 270         const int    nz        = kernelParams.grid.realGridSize[ZZ];
 271         const int    pny       = kernelParams.grid.realGridSizePadded[YY];
 272         const int    pnz       = kernelParams.grid.realGridSizePadded[ZZ];
 273
 274         const int    atomWarpIndex = atomIndexLocal % atomsPerWarp;
 275         const int    warpIndex     = atomIndexLocal / atomsPerWarp;
 276
 277         const int    splineIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex);
 278         const int    splineIndexY    = getSplineParamIndex(splineIndexBase, YY, ithy);
 279         const float2 tdy             = sm_splineParams[splineIndexY];
 280         const int    splineIndexZ    = getSplineParamIndex(splineIndexBase, ZZ, ithz);
 281         const float2 tdz             = sm_splineParams[splineIndexZ];
 282
 283         const int    ixBase         = sm_gridlineIndices[atomIndexLocal * DIM + XX];
 284         int          iy             = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
 285         if (wrapY & (iy >= ny))
 286         {
 287             iy -= ny;
 288         }
 289         int iz  = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
 290         if (iz >= nz)
 291         {
 292             iz -= nz;
 293         }
 294         const int constOffset    = iy * pnz + iz;
 295
 296 #pragma unroll order
 297         for (int ithx = 0; (ithx < order); ithx++)
 298         {
 299             int ix = ixBase + ithx;
 300             if (wrapX & (ix >= nx))
 301             {
 302                 ix -= nx;
 303             }
 304             const int     gridIndexGlobal  = ix * pny * pnz + constOffset;
 305             assert(gridIndexGlobal >= 0);
 306             const float   gridValue    = gm_grid[gridIndexGlobal];
 307             assert(isfinite(gridValue));
 308             const int     splineIndexX = getSplineParamIndex(splineIndexBase, XX, ithx);
 309             const float2  tdx          = sm_splineParams[splineIndexX];
 310             const float   fxy1         = tdz.x * gridValue;
 311             const float   fz1          = tdz.y * gridValue;
 312             fx += tdx.y * tdy.x * fxy1;
 313             fy += tdx.x * tdy.y * fxy1;
 314             fz += tdx.x * tdy.x * fz1;
 315         }
 316     }
 317
 318     // Reduction of partial force contributions
 319     __local float  sm_forces[atomsPerBlock * DIM];
 320
 321     __local float  sm_forceReduction[totalSharedMemory];
 322     __local float *sm_forceTemp[DIM];
 323
 324     reduce_atom_forces(sm_forces,
 325                        atomIndexLocal, splineIndex, lineIndex,
 326                        kernelParams.grid.realGridSizeFP,
 327                        fx, fy, fz,
 328                        sm_forceReduction,
 329                        sm_forceTemp);
 330     barrier(CLK_LOCAL_MEM_FENCE);
 331
 332     /* Calculating the final forces with no component branching, atomsPerBlock threads */
 333     const int forceIndexLocal  = threadLocalId;
 334     const int forceIndexGlobal = atomIndexOffset + forceIndexLocal;
 335     const int calcIndexCheck   = pme_gpu_check_atom_data_index(forceIndexGlobal, kernelParams.atoms.nAtoms);
 336     if ((forceIndexLocal < atomsPerBlock) & calcIndexCheck)
 337     {
 338         const float3  atomForces     = vload3(forceIndexLocal, sm_forces);
 339         const float   negCoefficient = -gm_coefficients[forceIndexGlobal];
 340         float3        result;
 341         result.x      = negCoefficient * kernelParams.current.recipBox[XX][XX] * atomForces.x;
 342         result.y      = negCoefficient * (kernelParams.current.recipBox[XX][YY] * atomForces.x +
 343                                           kernelParams.current.recipBox[YY][YY] * atomForces.y);
 344         result.z      = negCoefficient * (kernelParams.current.recipBox[XX][ZZ] * atomForces.x +
 345                                           kernelParams.current.recipBox[YY][ZZ] * atomForces.y +
 346                                           kernelParams.current.recipBox[ZZ][ZZ] * atomForces.z);
 347         vstore3(result, forceIndexLocal, sm_forces);
 348     }
 349
 350 #if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_)
 351     /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was gmx_syncwarp() in CUDA.
 352      * #2519
 353      */
 354     barrier(CLK_LOCAL_MEM_FENCE);
 355 #endif
 356
 357     assert(atomsPerBlock <= warp_size);
 358
 359     /* Writing or adding the final forces component-wise, single warp */
 360     const int blockForcesSize = atomsPerBlock * DIM;
 361     const int numIter         = (blockForcesSize + warp_size - 1) / warp_size;
 362     const int iterThreads     = blockForcesSize / numIter;
 363     if (threadLocalId < iterThreads)
 364     {
 365 #pragma unroll
 366         for (int i = 0; i < numIter; i++)
 367         {
 368             const int outputIndexLocal  = i * iterThreads + threadLocalId;
 369             const int outputIndexGlobal = get_group_id(XX) * blockForcesSize + outputIndexLocal;
 370             const int globalOutputCheck = pme_gpu_check_atom_data_index(outputIndexGlobal, kernelParams.atoms.nAtoms * DIM);
 371             if (globalOutputCheck)
 372             {
 373                 const float outputForceComponent = sm_forces[outputIndexLocal];
 374                 if (overwriteForces)
 375                 {
 376                     gm_forces[outputIndexGlobal] = outputForceComponent;
 377                 }
 378                 else
 379                 {
 380                     gm_forces[outputIndexGlobal] += outputForceComponent;
 381                 }
 382             }
 383         }
 384     }
 385 }