src/gromacs/ewald/pme_gather.clh

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *  \brief Implements PME OpenCL force gathering kernel.
  38  * When including this and other PME OpenCL kernel files, plenty of common
  39  * constants/macros are expected to be defined (such as "order" which is PME interpolation order).
  40  * For details, please see how pme_program.cl is compiled in pme_gpu_program_impl_ocl.cpp.
  41  *
  42  * This file's kernels specifically expect the following definitions:
  43  *
  44  * - atomsPerBlock which expresses how many atoms are processed by a single work group
  45  * - order which is a PME interpolation order
  46  * - wrapX and wrapY must evaluate to either true or false to specify whether the grid overlap
  47  * in dimension X/Y is to be used
  48  *
  49  *  \author Aleksei Iupinov <a.yupinov@gmail.com>
  50  */
  51
  52 #include "pme_gpu_types.h"
  53 #include "pme_gpu_calculate_splines.clh"
  54
  55 #ifndef COMPILE_GATHER_HELPERS_ONCE
  56 #    define COMPILE_GATHER_HELPERS_ONCE
  57
  58 /*! \brief
  59  * Unrolls the dynamic index accesses to the constant grid sizes to avoid local memory operations.
  60  */
  61 inline float read_grid_size(const float* realGridSizeFP, const int dimIndex)
  62 {
  63     switch (dimIndex)
  64     {
  65         case XX: return realGridSizeFP[XX];
  66         case YY: return realGridSizeFP[YY];
  67         case ZZ: return realGridSizeFP[ZZ];
  68         default: assert(false); break;
  69     }
  70     assert(false);
  71     return 0.0F;
  72 }
  73
  74 /*! \brief Reduce the partial force contributions.
  75  *
  76  * FIXME: this reduction should be simplified and improved, it does 3x16 force component
  77  *        reduction per 16 threads so no extra shared mem should be needed for intermediates
  78  *        or passing results back.
  79  *
  80  * \param[out]      sm_forces          Local memory array with the output forces (rvec).
  81  * \param[in]       atomIndexLocal     Local atom index
  82  * \param[in]       splineIndex        Spline index
  83  * \param[in]       lineIndex          Line index (same as threadLocalId)
  84  * \param[in]       realGridSizeFP     Local grid size constant
  85  * \param[in]       fx                 Input force partial component X
  86  * \param[in]       fy                 Input force partial component Y
  87  * \param[in]       fz                 Input force partial component Z
  88  * \param[in,out]   sm_forceReduction  Reduction working buffer
  89  * \param[in]       sm_forceTemp       Convenience pointers into \p sm_forceReduction
  90  */
  91 inline void reduce_atom_forces(__local float* __restrict__ sm_forces,
  92                                const int    atomIndexLocal,
  93                                const int    splineIndex,
  94                                const int    lineIndex,
  95                                const float* realGridSizeFP,
  96                                float        fx,
  97                                float        fy,
  98                                float        fz,
  99                                __local float* __restrict__ sm_forceReduction,
 100                                __local float** __restrict__ sm_forceTemp)
 101
 102 {
 103     // TODO: implement AMD intrinsics reduction, like with shuffles in CUDA version. #2514
 104
 105     /* Number of data components and threads for a single atom */
 106 #    define atomDataSize threadsPerAtom
 107     // We use blockSize local memory elements to read fx, or fy, or fz, and then reduce them to fit into smemPerDim elements
 108     // All those guys are defines and not consts, because they go into the local memory array size.
 109 #    define blockSize (atomsPerBlock * atomDataSize)
 110 #    define smemPerDim warp_size
 111 #    define smemReserved (DIM * smemPerDim)
 112
 113     const int numWarps  = blockSize / smemPerDim;
 114     const int minStride = max(1, atomDataSize / numWarps);
 115
 116 #    pragma unroll DIM
 117     for (int dimIndex = 0; dimIndex < DIM; dimIndex++)
 118     {
 119         int elementIndex = smemReserved + lineIndex;
 120         // Store input force contributions
 121         sm_forceReduction[elementIndex] = (dimIndex == XX) ? fx : (dimIndex == YY) ? fy : fz;
 122 #    if (warp_size < 48)
 123         // sync here when exec width is smaller than the size of the sm_forceReduction
 124         // buffer flushed to local mem above (size 3*16) as different warps will consume
 125         // the data below.
 126         barrier(CLK_LOCAL_MEM_FENCE);
 127 #    endif
 128
 129         // Reduce to fit into smemPerDim (warp size)
 130 #    pragma unroll
 131         for (int redStride = atomDataSize >> 1; redStride > minStride; redStride >>= 1)
 132         {
 133             if (splineIndex < redStride)
 134             {
 135                 sm_forceReduction[elementIndex] += sm_forceReduction[elementIndex + redStride];
 136             }
 137         }
 138         barrier(CLK_LOCAL_MEM_FENCE);
 139         // Last iteration - packing everything to be nearby, storing convenience pointer
 140         sm_forceTemp[dimIndex] = sm_forceReduction + dimIndex * smemPerDim;
 141         int redStride          = minStride;
 142         if (splineIndex < redStride)
 143         {
 144             const int packedIndex = atomIndexLocal * redStride + splineIndex;
 145             sm_forceTemp[dimIndex][packedIndex] =
 146                     sm_forceReduction[elementIndex] + sm_forceReduction[elementIndex + redStride];
 147         }
 148
 149         // barrier only needed for the last iteration on hardware with >=64-wide execution (e.g. AMD)
 150 #    if (warp_size < 64)
 151         barrier(CLK_LOCAL_MEM_FENCE);
 152 #    endif
 153     }
 154
 155 #    if (warp_size >= 64)
 156     barrier(CLK_LOCAL_MEM_FENCE);
 157 #    endif
 158
 159     assert((blockSize / warp_size) >= DIM);
 160
 161     const int warpIndex = lineIndex / warp_size;
 162     const int dimIndex  = warpIndex;
 163
 164     // First 3 warps can now process 1 dimension each
 165     if (dimIndex < DIM)
 166     {
 167         const int sourceIndex = lineIndex % warp_size;
 168 #    pragma unroll
 169         for (int redStride = minStride >> 1; redStride > 1; redStride >>= 1)
 170         {
 171             if (!(splineIndex & redStride))
 172             {
 173                 sm_forceTemp[dimIndex][sourceIndex] += sm_forceTemp[dimIndex][sourceIndex + redStride];
 174             }
 175         }
 176
 177         const float n         = read_grid_size(realGridSizeFP, dimIndex);
 178         const int   atomIndex = sourceIndex / minStride;
 179         if (sourceIndex == minStride * atomIndex)
 180         {
 181             sm_forces[atomIndex * DIM + dimIndex] =
 182                     (sm_forceTemp[dimIndex][sourceIndex] + sm_forceTemp[dimIndex][sourceIndex + 1]) * n;
 183         }
 184     }
 185 }
 186
 187 #endif // COMPILE_GATHER_HELPERS_ONCE
 188
 189 /*! \brief
 190  * An OpenCL kernel which gathers the atom forces from the grid.
 191  * The grid is assumed to be wrapped in dimension Z.
 192  * Please see the file description for additional defines which this kernel expects.
 193  *
 194  * \param[in]     kernelParams         All the PME GPU data.
 195  * \param[in]     gm_coefficients      Atom charges/coefficients.
 196  * \param[in]     gm_grid              Global 3D grid.
 197  * \param[in]     gm_theta             Atom spline parameter values
 198  * \param[in]     gm_dtheta            Atom spline parameter derivatives
 199  * \param[in]     gm_gridlineIndices   Atom gridline indices (ivec)
 200  * \param[in,out] gm_forces            Atom forces (rvec)
 201  */
 202 __attribute__((reqd_work_group_size(order, order, atomsPerBlock)))
 203 __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKernelParams kernelParams,
 204                                                         __global const float* __restrict__ gm_coefficients,
 205                                                         __global const float* __restrict__ gm_grid,
 206                                                         __global const float* __restrict__ gm_theta,
 207                                                         __global const float* __restrict__ gm_dtheta,
 208                                                         __global const int* __restrict__ gm_gridlineIndices,
 209                                                         __global float* __restrict__ gm_forces)
 210 {
 211     /* These are the atom indices - for the shared and global memory */
 212     const int atomIndexLocal  = get_local_id(ZZ);
 213     const int atomIndexOffset = (int)get_group_id(XX) * atomsPerBlock;
 214     const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
 215
 216 /* Some sizes which are defines and not consts because they go into the array size */
 217 #define blockSize (atomsPerBlock * atomDataSize)
 218     assert(blockSize == (get_local_size(0) * get_local_size(1) * get_local_size(2)));
 219 #define smemPerDim warp_size
 220 #define smemReserved (DIM * smemPerDim)
 221 #define totalSharedMemory (smemReserved + blockSize)
 222 #define gridlineIndicesSize (atomsPerBlock * DIM)
 223 #define splineParamsSize (atomsPerBlock * DIM * order)
 224
 225     __local int sm_gridlineIndices[gridlineIndicesSize];
 226     __local float2 sm_splineParams[splineParamsSize]; /* Theta/dtheta pairs  as .x/.y */
 227
 228     /* Spline Y/Z coordinates */
 229     const int ithy = get_local_id(YY);
 230     const int ithz = get_local_id(XX);
 231
 232     assert((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0)
 233            <= MAX_INT);
 234     const int threadLocalId =
 235             (int)((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0)
 236                   + get_local_id(0));
 237
 238     /* These are the spline contribution indices in shared memory */
 239     assert((get_local_id(1) * get_local_size(0) + get_local_id(0)) <= MAX_INT);
 240     const int splineIndex =
 241             (int)(get_local_id(1) * get_local_size(0)
 242                   + get_local_id(0));    /* Relative to the current particle , 0..15 for order 4 */
 243     const int lineIndex = threadLocalId; /* And to all the block's particles */
 244
 245     /* Staging the atom gridline indices, DIM * atomsPerBlock threads */
 246     const int localGridlineIndicesIndex = threadLocalId;
 247     const int globalGridlineIndicesIndex =
 248             (int)get_group_id(XX) * gridlineIndicesSize + localGridlineIndicesIndex;
 249     if (localGridlineIndicesIndex < gridlineIndicesSize)
 250     {
 251         sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
 252         assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
 253     }
 254     /* Staging the spline parameters, DIM * order * atomsPerBlock threads */
 255     const int localSplineParamsIndex = threadLocalId;
 256     const int globalSplineParamsIndex = (int)get_group_id(XX) * splineParamsSize + localSplineParamsIndex;
 257     if (localSplineParamsIndex < splineParamsSize)
 258     {
 259         sm_splineParams[localSplineParamsIndex].x = gm_theta[globalSplineParamsIndex];
 260         sm_splineParams[localSplineParamsIndex].y = gm_dtheta[globalSplineParamsIndex];
 261         assert(isfinite(sm_splineParams[localSplineParamsIndex].x));
 262         assert(isfinite(sm_splineParams[localSplineParamsIndex].y));
 263     }
 264     barrier(CLK_LOCAL_MEM_FENCE);
 265
 266     float fx = 0.0F;
 267     float fy = 0.0F;
 268     float fz = 0.0F;
 269
 270     const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]);
 271
 272     if (chargeCheck)
 273     {
 274         const int nx  = kernelParams.grid.realGridSize[XX];
 275         const int ny  = kernelParams.grid.realGridSize[YY];
 276         const int nz  = kernelParams.grid.realGridSize[ZZ];
 277         const int pny = kernelParams.grid.realGridSizePadded[YY];
 278         const int pnz = kernelParams.grid.realGridSizePadded[ZZ];
 279
 280         const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
 281         const int warpIndex     = atomIndexLocal / atomsPerWarp;
 282
 283         const int    splineIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex);
 284         const int    splineIndexY    = getSplineParamIndex(splineIndexBase, YY, ithy);
 285         const float2 tdy             = sm_splineParams[splineIndexY];
 286         const int    splineIndexZ    = getSplineParamIndex(splineIndexBase, ZZ, ithz);
 287         const float2 tdz             = sm_splineParams[splineIndexZ];
 288
 289         const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];
 290         int       iy     = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
 291         if (wrapY & (iy >= ny))
 292         {
 293             iy -= ny;
 294         }
 295         int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
 296         if (iz >= nz)
 297         {
 298             iz -= nz;
 299         }
 300         const int constOffset = iy * pnz + iz;
 301
 302 #pragma unroll order
 303         for (int ithx = 0; (ithx < order); ithx++)
 304         {
 305             int ix = ixBase + ithx;
 306             if (wrapX & (ix >= nx))
 307             {
 308                 ix -= nx;
 309             }
 310             const int gridIndexGlobal = ix * pny * pnz + constOffset;
 311             assert(gridIndexGlobal >= 0);
 312             const float gridValue = gm_grid[gridIndexGlobal];
 313             assert(isfinite(gridValue));
 314             const int    splineIndexX = getSplineParamIndex(splineIndexBase, XX, ithx);
 315             const float2 tdx          = sm_splineParams[splineIndexX];
 316             const float  fxy1         = tdz.x * gridValue;
 317             const float  fz1          = tdz.y * gridValue;
 318             fx += tdx.y * tdy.x * fxy1;
 319             fy += tdx.x * tdy.y * fxy1;
 320             fz += tdx.x * tdy.x * fz1;
 321         }
 322     }
 323
 324     // Reduction of partial force contributions
 325     __local float sm_forces[atomsPerBlock * DIM];
 326
 327     __local float  sm_forceReduction[totalSharedMemory];
 328     __local float* sm_forceTemp[DIM];
 329
 330     reduce_atom_forces(sm_forces, atomIndexLocal, splineIndex, lineIndex,
 331                        kernelParams.grid.realGridSizeFP, fx, fy, fz, sm_forceReduction, sm_forceTemp);
 332     barrier(CLK_LOCAL_MEM_FENCE);
 333
 334     /* Calculating the final forces with no component branching, atomsPerBlock threads */
 335     const int forceIndexLocal  = threadLocalId;
 336     const int forceIndexGlobal = atomIndexOffset + forceIndexLocal;
 337     if (forceIndexLocal < atomsPerBlock)
 338     {
 339         const float3 atomForces     = vload3(forceIndexLocal, sm_forces);
 340         const float  negCoefficient = -gm_coefficients[forceIndexGlobal];
 341         float3       result;
 342         result.x = negCoefficient * kernelParams.current.recipBox[XX][XX] * atomForces.x;
 343         result.y = negCoefficient
 344                    * (kernelParams.current.recipBox[XX][YY] * atomForces.x
 345                       + kernelParams.current.recipBox[YY][YY] * atomForces.y);
 346         result.z = negCoefficient
 347                    * (kernelParams.current.recipBox[XX][ZZ] * atomForces.x
 348                       + kernelParams.current.recipBox[YY][ZZ] * atomForces.y
 349                       + kernelParams.current.recipBox[ZZ][ZZ] * atomForces.z);
 350         vstore3(result, forceIndexLocal, sm_forces);
 351     }
 352
 353 #if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_)
 354     /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was
 355      * __syncwarp() in CUDA. #2519
 356      */
 357     barrier(CLK_LOCAL_MEM_FENCE);
 358 #endif
 359
 360     assert(atomsPerBlock <= warp_size);
 361
 362     /* Writing or adding the final forces component-wise, single warp */
 363     const int blockForcesSize = atomsPerBlock * DIM;
 364     const int numIter         = (blockForcesSize + warp_size - 1) / warp_size;
 365     const int iterThreads     = blockForcesSize / numIter;
 366     if (threadLocalId < iterThreads)
 367     {
 368 #pragma unroll
 369         for (int i = 0; i < numIter; i++)
 370         {
 371             const int outputIndexLocal  = i * iterThreads + threadLocalId;
 372             const int outputIndexGlobal = (int)get_group_id(XX) * blockForcesSize + outputIndexLocal;
 373             const float outputForceComponent = sm_forces[outputIndexLocal];
 374             gm_forces[outputIndexGlobal]     = outputForceComponent;
 375         }
 376     }
 377 }