/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief Implements PME OpenCL force gathering kernel.
 * When including this and other PME OpenCL kernel files, plenty of common
 * constants/macros are expected to be defined (such as "order", which is the PME interpolation order).
 * For details, please see how pme_program.cl is compiled in pme_gpu_program_impl_ocl.cpp.
 *
 * This file's kernels specifically expect the following definitions:
 *
 * - atomsPerBlock, which expresses how many atoms are processed by a single work group
 * - order, which is the PME interpolation order
 * - wrapX and wrapY, which must evaluate to either true or false to specify whether the grid
 *   overlap in dimension X/Y is to be used
 *
 * \author Aleksei Iupinov <a.yupinov@gmail.com>
 */

#include "pme_gpu_calculate_splines.clh"
#include "pme_gpu_types.h"

#ifndef COMPILE_GATHER_HELPERS_ONCE
#    define COMPILE_GATHER_HELPERS_ONCE

/*! \brief
 * Unrolls the dynamic index accesses to the constant grid sizes to avoid local memory operations.
 */
inline float read_grid_size(const float* realGridSizeFP, const int dimIndex)
{
    switch (dimIndex)
    {
        case XX: return realGridSizeFP[XX];
        case YY: return realGridSizeFP[YY];
        case ZZ: return realGridSizeFP[ZZ];
        default: assert(false); break;
    }
    assert(false);
    return 0.0F;
}

/*! \brief Reduce the partial force contributions.
 *
 * FIXME: this reduction should be simplified and improved; it does a 3x16 force component
 * reduction per 16 threads, so no extra shared memory should be needed for intermediates
 * or passing results back.
 *
 * \param[out] sm_forces          Local memory array with the output forces (rvec).
 * \param[in]  atomIndexLocal     Local atom index.
 * \param[in]  splineIndex        Spline index.
 * \param[in]  lineIndex          Line index (same as threadLocalId).
 * \param[in]  realGridSizeFP     Local grid size constant.
 * \param[in]  fx                 Input force partial component X.
 * \param[in]  fy                 Input force partial component Y.
 * \param[in]  fz                 Input force partial component Z.
 * \param[in,out] sm_forceReduction Reduction working buffer.
 * \param[in]  sm_forceTemp       Convenience pointers into \p sm_forceReduction.
 */
inline void reduce_atom_forces(__local float* __restrict__ sm_forces,
                               const int    atomIndexLocal,
                               const int    splineIndex,
                               const int    lineIndex,
                               const float* realGridSizeFP,
                               const float  fx,
                               const float  fy,
                               const float  fz,
                               __local float* __restrict__ sm_forceReduction,
                               __local float** __restrict__ sm_forceTemp)
{
    // TODO: implement AMD intrinsics reduction, like with shuffles in the CUDA version. #2514

    /* Number of data components and threads for a single atom */
#    define atomDataSize threadsPerAtom
    // We use blockSize local memory elements to read fx, or fy, or fz, and then reduce them to
    // fit into smemPerDim elements. These are defines rather than consts because they are used
    // for the local memory array size.
#    define blockSize (atomsPerBlock * atomDataSize)
#    define smemPerDim warp_size
#    define smemReserved (DIM * smemPerDim)

    const int numWarps  = blockSize / smemPerDim;
    const int minStride = max(1, atomDataSize / numWarps);
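    /* Worked example of the sizing above (an illustration only; the actual values depend on
     * the compile-time defines): with order 4, atomDataSize = threadsPerAtom = 16; assuming
     * atomsPerBlock = 8 and warp_size = 32, blockSize = 128, numWarps = 128 / 32 = 4, and
     * minStride = max(1, 16 / 4) = 4 partial sums remain per atom and dimension after the
     * first reduction stage below. */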

    for (int dimIndex = 0; dimIndex < DIM; dimIndex++)
    {
        int elementIndex = smemReserved + lineIndex;
        // Store input force contributions
        sm_forceReduction[elementIndex] = (dimIndex == XX) ? fx : (dimIndex == YY) ? fy : fz;
#    if (warp_size < 48)
        // sync here when exec width is smaller than the size of the sm_forceReduction
        // buffer flushed to local mem above (size 3*16) as different warps will consume
        // the data below
        barrier(CLK_LOCAL_MEM_FENCE);
#    endif

        // Reduce to fit into smemPerDim (warp size)
        for (int redStride = atomDataSize >> 1; redStride > minStride; redStride >>= 1)
        {
            if (splineIndex < redStride)
            {
                sm_forceReduction[elementIndex] += sm_forceReduction[elementIndex + redStride];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // Last iteration - packing everything to be nearby, storing a convenience pointer
        sm_forceTemp[dimIndex] = sm_forceReduction + dimIndex * smemPerDim;
        int redStride          = minStride;
        if (splineIndex < redStride)
        {
            const int packedIndex = atomIndexLocal * redStride + splineIndex;
            sm_forceTemp[dimIndex][packedIndex] =
                    sm_forceReduction[elementIndex] + sm_forceReduction[elementIndex + redStride];
        }
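        /* After this packing step, the minStride remaining partial sums of atom a for the
         * current dimension live contiguously in sm_forceTemp[dimIndex][a * minStride ..
         * a * minStride + minStride - 1], at the start of that dimension's smemPerDim region,
         * so the per-dimension warps below can finish them with strided adds. */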

        // barrier only needed for the last iteration on hardware with >=64-wide execution (e.g. AMD)
#    if (warp_size < 64)
        barrier(CLK_LOCAL_MEM_FENCE);
#    endif
    }

#    if (warp_size >= 64)
    barrier(CLK_LOCAL_MEM_FENCE);
#    endif

    assert((blockSize / warp_size) >= DIM);

    const int warpIndex = lineIndex / warp_size;
    const int dimIndex  = warpIndex;

    // First 3 warps can now process 1 dimension each
    if (dimIndex < DIM)
    {
        const int sourceIndex = lineIndex % warp_size;
        for (int redStride = minStride >> 1; redStride > 1; redStride >>= 1)
        {
            if (!(splineIndex & redStride))
            {
                sm_forceTemp[dimIndex][sourceIndex] += sm_forceTemp[dimIndex][sourceIndex + redStride];
            }
        }

        const float n = read_grid_size(realGridSizeFP, dimIndex);
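        /* The spline derivatives accumulated above are taken with respect to the grid
         * coordinate; scaling by the grid size n converts the gradient to fractional
         * coordinates of the unit cell, ready for the recipBox transform applied later in
         * calculateAndStoreGridForces(). */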
        const int atomIndex = sourceIndex / minStride;
        if (sourceIndex == minStride * atomIndex)
        {
            sm_forces[atomIndex * DIM + dimIndex] =
                    (sm_forceTemp[dimIndex][sourceIndex] + sm_forceTemp[dimIndex][sourceIndex + 1]) * n;
        }
    }
}

/*! \brief Calculate the sum of the force partial components (in X, Y and Z).
 *
 * \param[out] fx                 The force partial component in the X dimension.
 * \param[out] fy                 The force partial component in the Y dimension.
 * \param[out] fz                 The force partial component in the Z dimension.
 * \param[in]  ixBase             The grid line index base value in the X dimension.
 * \param[in]  nx                 The grid real size in the X dimension.
 * \param[in]  pny                The padded grid real size in the Y dimension.
 * \param[in]  pnz                The padded grid real size in the Z dimension.
 * \param[in]  constOffset        The offset to calculate the global grid index.
 * \param[in]  splineIndexBase    The base value of the spline parameter index.
 * \param[in]  tdy                The theta and dtheta in the Y dimension.
 * \param[in]  tdz                The theta and dtheta in the Z dimension.
 * \param[in]  sm_splineParams    Shared memory array of spline parameters.
 * \param[in]  gm_grid            Global memory array of the grid to use.
 */
inline void sumForceComponents(float*       fx,
                               float*       fy,
                               float*       fz,
                               const int    ixBase,
                               const int    nx,
                               const int    pny,
                               const int    pnz,
                               const int    constOffset,
                               const int    splineIndexBase,
                               const float2 tdy,
                               const float2 tdz,
                               __local const float2* __restrict__ sm_splineParams,
                               __global const float* __restrict__ gm_grid)
{
#    pragma unroll order
    for (int ithx = 0; (ithx < order); ithx++)
    {
        int ix = ixBase + ithx;
        if (wrapX & (ix >= nx))
        {
            ix -= nx;
        }
        const int gridIndexGlobal = ix * pny * pnz + constOffset;
        assert(gridIndexGlobal >= 0);
        const float gridValue = gm_grid[gridIndexGlobal];
        assert(isfinite(gridValue));
        const int    splineIndexX = getSplineParamIndex(splineIndexBase, XX, ithx);
        const float2 tdx          = sm_splineParams[splineIndexX];
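        /* Separable B-spline force formula: each Cartesian partial component takes the
         * derivative (the .y member, dtheta) in its own dimension and the plain weights
         * (the .x member, theta) in the other two, all multiplied by the grid value. */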
        const float fxy1 = tdz.x * gridValue;
        const float fz1  = tdz.y * gridValue;
        *fx += tdx.y * tdy.x * fxy1;
        *fy += tdx.x * tdy.y * fxy1;
        *fz += tdx.x * tdy.x * fz1;
    }
}

/*! \brief Calculate the grid forces and store them in shared memory.
 *
 * \param[in,out] sm_forces       Shared memory array with the output forces.
 * \param[in] forceIndexLocal     The local (per thread) index in the sm_forces array.
 * \param[in] forceIndexGlobal    The index of the thread in the gm_coefficients array.
 * \param[in] recipBox            The reciprocal box.
 * \param[in] scale               The scale to use when calculating the forces (only relevant when
 *                                more than one coefficient set is in use).
 * \param[in] gm_coefficients     Global memory array of the coefficients to use.
 */
inline void calculateAndStoreGridForces(__local float* __restrict__ sm_forces,
                                        const int   forceIndexLocal,
                                        const int   forceIndexGlobal,
                                        const float recipBox[DIM][DIM],
                                        const float scale,
                                        __global const float* __restrict__ gm_coefficients)
{
    const float3 atomForces     = vload3(forceIndexLocal, sm_forces);
    float        negCoefficient = -scale * gm_coefficients[forceIndexGlobal];
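    /* Only the upper-triangular entries of recipBox are referenced below; in the GPU layout
     * of GROMACS's triclinic boxes, the [YY][XX], [ZZ][XX] and [ZZ][YY] entries are zero,
     * which keeps the fractional-to-Cartesian force transform down to six multiplications. */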
    float3 result;
    result.x = negCoefficient * recipBox[XX][XX] * atomForces.x;
    result.y = negCoefficient * (recipBox[XX][YY] * atomForces.x + recipBox[YY][YY] * atomForces.y);
    result.z = negCoefficient
               * (recipBox[XX][ZZ] * atomForces.x + recipBox[YY][ZZ] * atomForces.y
                  + recipBox[ZZ][ZZ] * atomForces.z);
    vstore3(result, forceIndexLocal, sm_forces);
}

#endif // COMPILE_GATHER_HELPERS_ONCE

/*! \brief
 * An OpenCL kernel which gathers the atom forces from the grid.
 * The grid is assumed to be wrapped in dimension Z.
 * Please see the file description for additional defines which this kernel expects.
 *
 * \param[in] kernelParams        All the PME GPU data.
 * \param[in] gm_coefficientsA    Atom charges/coefficients in the unperturbed state, or FEP
 * state A.
 * \param[in] gm_coefficientsB    Atom charges/coefficients in FEP state B. Only used
 * when spreading interpolated coefficients on one grid or spreading two sets of coefficients on two
 * grids.
 * \param[in] gm_gridA            Global 3D grid for the unperturbed state, FEP
 * state A or the single grid used for interpolated coefficients on one grid in FEP A/B.
 * \param[in] gm_gridB            Global 3D grid for FEP state B when using dual
 * grids (when calculating energy and virials).
 * \param[in] gm_theta            Atom spline parameter values.
 * \param[in] gm_dtheta           Atom spline parameter derivatives.
 * \param[in] gm_gridlineIndices  Atom gridline indices (ivec).
 * \param[in,out] gm_forces       Atom forces (rvec).
 */
__attribute__((reqd_work_group_size(order, order, atomsPerBlock)))
__kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKernelParams kernelParams,
                                                        __global const float* __restrict__ gm_coefficientsA,
                                                        __global const float* __restrict__ gm_coefficientsB,
                                                        __global const float* __restrict__ gm_gridA,
                                                        __global const float* __restrict__ gm_gridB,
                                                        __global const float* __restrict__ gm_theta,
                                                        __global const float* __restrict__ gm_dtheta,
                                                        __global const int* __restrict__ gm_gridlineIndices,
                                                        __global float* __restrict__ gm_forces)
{
    assert(numGrids == 1 || numGrids == 2);

    /* These are the atom indices - for the shared and global memory */
    const int atomIndexLocal  = get_local_id(ZZ);
    const int atomIndexOffset = (int)get_group_id(XX) * atomsPerBlock;
    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;

    /* Some sizes which are defines and not consts because they go into the array size */
#define blockSize (atomsPerBlock * atomDataSize)
    assert(blockSize == (get_local_size(0) * get_local_size(1) * get_local_size(2)));
#define smemPerDim warp_size
#define smemReserved (DIM * smemPerDim)
#define totalSharedMemory (smemReserved + blockSize)
#define gridlineIndicesSize (atomsPerBlock * DIM)
#define splineParamsSize (atomsPerBlock * DIM * order)

    __local int    sm_gridlineIndices[gridlineIndicesSize];
    __local float2 sm_splineParams[splineParamsSize]; /* Theta/dtheta pairs as .x/.y */
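    /* Example of the resulting footprint (illustrative values; atomsPerBlock is a
     * compile-time define): with order 4 and atomsPerBlock = 8, sm_gridlineIndices holds
     * 8 * 3 = 24 ints and sm_splineParams holds 8 * 3 * 4 = 96 float2 theta/dtheta pairs. */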

    /* Spline Y/Z coordinates */
    const int ithy = get_local_id(YY);
    const int ithz = get_local_id(XX);

    assert((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0)
           <= MAX_INT);
    const int threadLocalId =
            (int)((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0)
                  + get_local_id(0));

    /* These are the spline contribution indices in shared memory */
    assert((get_local_id(1) * get_local_size(0) + get_local_id(0)) <= MAX_INT);
    const int splineIndex =
            (int)(get_local_id(1) * get_local_size(0)
                  + get_local_id(0)); /* Relative to the current particle, 0..15 for order 4 */
    const int lineIndex = threadLocalId; /* And to all the block's particles */
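    /* With the required work group size (order, order, atomsPerBlock) this means, e.g. for
     * order 4: ithz = get_local_id(0) and ithy = get_local_id(1), so splineIndex = ithy * 4 + ithz
     * enumerates the 16 (y, z) spline contribution pairs of one atom, while lineIndex also
     * distinguishes the atoms within the work group. */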

    /* Staging the atom gridline indices, DIM * atomsPerBlock threads */
    const int localGridlineIndicesIndex = threadLocalId;
    const int globalGridlineIndicesIndex =
            (int)get_group_id(XX) * gridlineIndicesSize + localGridlineIndicesIndex;
    if (localGridlineIndicesIndex < gridlineIndicesSize)
    {
        sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
        assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
    }
    /* Staging the spline parameters, DIM * order * atomsPerBlock threads */
    const int localSplineParamsIndex  = threadLocalId;
    const int globalSplineParamsIndex = (int)get_group_id(XX) * splineParamsSize + localSplineParamsIndex;
    if (localSplineParamsIndex < splineParamsSize)
    {
        sm_splineParams[localSplineParamsIndex].x = gm_theta[globalSplineParamsIndex];
        sm_splineParams[localSplineParamsIndex].y = gm_dtheta[globalSplineParamsIndex];
        assert(isfinite(sm_splineParams[localSplineParamsIndex].x));
        assert(isfinite(sm_splineParams[localSplineParamsIndex].y));
    }
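    /* Every thread below consumes gridline indices and spline parameters staged by other
     * threads of the work group, so synchronize before reading them. */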
    barrier(CLK_LOCAL_MEM_FENCE);

    float fx = 0.0F;
    float fy = 0.0F;
    float fz = 0.0F;

    int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsA[atomIndexGlobal]);

    const int nx  = kernelParams.grid.realGridSize[XX];
    const int ny  = kernelParams.grid.realGridSize[YY];
    const int nz  = kernelParams.grid.realGridSize[ZZ];
    const int pny = kernelParams.grid.realGridSizePadded[YY];
    const int pnz = kernelParams.grid.realGridSizePadded[ZZ];

    const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
    const int warpIndex     = atomIndexLocal / atomsPerWarp;

    const int    splineIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex);
    const int    splineIndexY    = getSplineParamIndex(splineIndexBase, YY, ithy);
    const float2 tdy             = sm_splineParams[splineIndexY];
    const int    splineIndexZ    = getSplineParamIndex(splineIndexBase, ZZ, ithz);
    const float2 tdz             = sm_splineParams[splineIndexZ];
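    /* tdy and tdz hold this thread's fixed Y/Z spline data: the .x member is theta (the
     * B-spline weight) and the .y member is dtheta (its derivative), matching the staging
     * of gm_theta/gm_dtheta into sm_splineParams above. */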

    const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];
    int       iy     = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
    if (wrapY & (iy >= ny))
    {
        iy -= ny;
    }
    int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
    if (iz >= nz)
    {
        iz -= nz;
    }
    const int constOffset = iy * pnz + iz;
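    /* The flattened padded-grid index of cell (ix, iy, iz) is ix * pny * pnz + iy * pnz + iz;
     * constOffset precomputes the (iy, iz) part, which is shared by all order iterations
     * over ithx inside sumForceComponents(). */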

    if (chargeCheck)
    {
        sumForceComponents(
                &fx, &fy, &fz, ixBase, nx, pny, pnz, constOffset, splineIndexBase, tdy, tdz, sm_splineParams, gm_gridA);
    }

    // Reduction of partial force contributions
    __local float sm_forces[atomsPerBlock * DIM];

    __local float  sm_forceReduction[totalSharedMemory];
    __local float* sm_forceTemp[DIM];

    reduce_atom_forces(sm_forces,
                       atomIndexLocal,
                       splineIndex,
                       lineIndex,
                       kernelParams.grid.realGridSizeFP,
                       fx,
                       fy,
                       fz,
                       sm_forceReduction,
                       sm_forceTemp);
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Calculating the final forces with no component branching, atomsPerBlock threads */
    const int   forceIndexLocal  = threadLocalId;
    const int   forceIndexGlobal = atomIndexOffset + forceIndexLocal;
    const float scale            = kernelParams.current.scale;
    if (forceIndexLocal < atomsPerBlock)
    {
        calculateAndStoreGridForces(
                sm_forces, forceIndexLocal, forceIndexGlobal, kernelParams.current.recipBox, scale, gm_coefficientsA);
    }

#if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_)
    /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was
     * __syncwarp() in CUDA. #2519
     */
    barrier(CLK_LOCAL_MEM_FENCE);
#endif

    assert(atomsPerBlock <= warp_size);

    /* Writing or adding the final forces component-wise, single warp */
    const int blockForcesSize = atomsPerBlock * DIM;
    const int numIter         = (blockForcesSize + warp_size - 1) / warp_size;
    const int iterThreads     = blockForcesSize / numIter;
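    /* Illustrative arithmetic (assuming atomsPerBlock = 8 and warp_size = 32):
     * blockForcesSize = 8 * 3 = 24 floats, numIter = (24 + 31) / 32 = 1 and
     * iterThreads = 24, so a single pass of one warp writes the whole block. */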
    if (threadLocalId < iterThreads)
    {
        for (int i = 0; i < numIter; i++)
        {
            const int   outputIndexLocal     = i * iterThreads + threadLocalId;
            const int   outputIndexGlobal    = (int)get_group_id(XX) * blockForcesSize + outputIndexLocal;
            const float outputForceComponent = sm_forces[outputIndexLocal];
            gm_forces[outputIndexGlobal]     = outputForceComponent;
        }
    }
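    /* Second gather pass for FEP state B when two separate grids are used: the same shared
     * buffers are reused, so the work group must synchronize before overwriting them, and
     * the results are accumulated into gm_forces rather than stored. */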

    if (numGrids == 2 && scale != 1.0F)
    {
        barrier(CLK_LOCAL_MEM_FENCE);
        fx          = 0.0F;
        fy          = 0.0F;
        fz          = 0.0F;
        chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsB[atomIndexGlobal]);
        if (chargeCheck)
        {
            sumForceComponents(
                    &fx, &fy, &fz, ixBase, nx, pny, pnz, constOffset, splineIndexBase, tdy, tdz, sm_splineParams, gm_gridB);
        }
        reduce_atom_forces(sm_forces,
                           atomIndexLocal,
                           splineIndex,
                           lineIndex,
                           kernelParams.grid.realGridSizeFP,
                           fx,
                           fy,
                           fz,
                           sm_forceReduction,
                           sm_forceTemp);
        barrier(CLK_LOCAL_MEM_FENCE);

        if (forceIndexLocal < atomsPerBlock)
        {
            calculateAndStoreGridForces(sm_forces,
                                        forceIndexLocal,
                                        forceIndexGlobal,
                                        kernelParams.current.recipBox,
                                        1.0F - scale,
                                        gm_coefficientsB);
        }

#if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_)
        /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was
         * __syncwarp() in CUDA. #2519
         */
        barrier(CLK_LOCAL_MEM_FENCE);
#endif

        /* Writing or adding the final forces component-wise, single warp */
        if (threadLocalId < iterThreads)
        {
            for (int i = 0; i < numIter; i++)
            {
                const int   outputIndexLocal     = i * iterThreads + threadLocalId;
                const int   outputIndexGlobal    = (int)get_group_id(XX) * blockForcesSize + outputIndexLocal;
                const float outputForceComponent = sm_forces[outputIndexLocal];
                gm_forces[outputIndexGlobal] += outputForceComponent;
            }
        }
    }
}