src/gromacs/ewald/pme_solve.clh

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *  \brief Implements PME OpenCL solving kernel.
  38  * When including this and other PME OpenCL kernel files, plenty of common
  39  * constants/macros are expected to be defined.
  40  * For details, please see how pme_program.cl is compiled in pme_gpu_program_impl_ocl.cpp.
  41  *
  42  * This file's solving kernel specifically expects the following definitions for its flavors:
  43  *
  44  * - gridOrdering must be defined to either XYZ or YZX
  45  * and corresponds to the dimension order of the grid (GridOrdering enum in CUDA kernels);
  46  * - computeEnergyAndVirial must evaluate to true or false, and expresses
  47  * whether the reduction is performed.
  48  *
  49  *  \author Aleksei Iupinov <a.yupinov@gmail.com>
  50  */
  51
  52 #include "gromacs/gpu_utils/vectype_ops.clh"
  53
  54 #include "pme_gpu_types.h"
  55
  56 /*! \brief
  57  * PME complex grid solver kernel function.
  58  * Please see the file description for additional defines which this kernel expects.
  59  *
  60  * \param[in]     kernelParams         Input PME GPU data in constant memory.
  61  * \param[in]     gm_splineModuli      B-Spline moduli.
  62  * \param[out]    gm_virialAndEnergy   Reduced virial and enrgy (only with computeEnergyAndVirial ==
  63  * true)
  64  * \param[in,out] gm_grid              Fourier grid to transform.
  65  */
  66 __attribute__((work_group_size_hint(c_solveMaxWorkGroupSize, 1, 1)))
  67 __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKernelParams kernelParams,
  68                                                        __global const float* __restrict__ gm_splineModuli,
  69                                                        __global float* __restrict__ gm_virialAndEnergy,
  70                                                        __global float2* __restrict__ gm_grid)
  71 {
  72     /* This kernel supports 2 different grid dimension orderings: YZX and XYZ */
  73     int majorDim;
  74     int middleDim;
  75     int minorDim;
  76     if (gridOrdering == YZX)
  77     {
  78         majorDim  = YY;
  79         middleDim = ZZ;
  80         minorDim  = XX;
  81     }
  82     if (gridOrdering == XYZ)
  83     {
  84         majorDim  = XX;
  85         middleDim = YY;
  86         minorDim  = ZZ;
  87     }
  88
  89     __global const float* __restrict__ gm_splineValueMajor =
  90             gm_splineModuli + kernelParams.grid.splineValuesOffset[majorDim];
  91     __global const float* __restrict__ gm_splineValueMiddle =
  92             gm_splineModuli + kernelParams.grid.splineValuesOffset[middleDim];
  93     __global const float* __restrict__ gm_splineValueMinor =
  94             gm_splineModuli + kernelParams.grid.splineValuesOffset[minorDim];
  95
  96     /* Various grid sizes and indices */
  97     const int localOffsetMinor  = 0; // unused
  98     const int localOffsetMajor  = 0; // unused
  99     const int localOffsetMiddle = 0; // unused
 100     const int localSizeMinor    = kernelParams.grid.complexGridSizePadded[minorDim];
 101     const int localSizeMiddle   = kernelParams.grid.complexGridSizePadded[middleDim];
 102     const int localCountMiddle  = kernelParams.grid.complexGridSize[middleDim];
 103     const int localCountMinor   = kernelParams.grid.complexGridSize[minorDim];
 104     const int nMajor            = kernelParams.grid.realGridSize[majorDim];
 105     const int nMiddle           = kernelParams.grid.realGridSize[middleDim];
 106     const int nMinor            = kernelParams.grid.realGridSize[minorDim];
 107     const int maxkMajor         = (nMajor + 1) / 2;  // X or Y
 108     const int maxkMiddle        = (nMiddle + 1) / 2; // Y OR Z => only check for !YZX
 109     const int maxkMinor         = (nMinor + 1) / 2;  // Z or X => only check for YZX
 110
 111     /* Each thread works on one cell of the Fourier space complex 3D grid (gm_grid).
 112      * Each block handles up to c_solveMaxWorkGroupSize cells -
 113      * depending on the grid contiguous dimension size,
 114      * that can range from a part of a single gridline to several complete gridlines.
 115      */
 116     const int threadLocalId     = get_local_id(XX);
 117     const int gridLineSize      = localCountMinor;
 118     const int gridLineIndex     = threadLocalId / gridLineSize;
 119     const int gridLineCellIndex = threadLocalId - gridLineSize * gridLineIndex;
 120     const int gridLinesPerBlock = max((int)(get_local_size(XX)) / gridLineSize, 1);
 121     const int activeWarps       = ((int)get_local_size(XX) / warp_size);
 122     assert((get_group_id(XX) * get_local_size(XX)) < MAX_INT);
 123     const int indexMinor  = (int)get_group_id(XX) * (int)get_local_size(XX) + gridLineCellIndex;
 124     const int indexMiddle = (int)get_group_id(YY) * gridLinesPerBlock + gridLineIndex;
 125     const int indexMajor  = (int)get_group_id(ZZ);
 126
 127     /* Optional outputs */
 128     float energy = 0.0F;
 129     float virxx  = 0.0F;
 130     float virxy  = 0.0F;
 131     float virxz  = 0.0F;
 132     float viryy  = 0.0F;
 133     float viryz  = 0.0F;
 134     float virzz  = 0.0F;
 135
 136     assert(indexMajor < kernelParams.grid.complexGridSize[majorDim]);
 137     if ((indexMiddle < localCountMiddle) & (indexMinor < localCountMinor)
 138         & (gridLineIndex < gridLinesPerBlock))
 139     {
 140         /* The offset should be equal to the global thread index for coalesced access */
 141         const int gridThreadIndex =
 142                 (indexMajor * localSizeMiddle + indexMiddle) * localSizeMinor + indexMinor;
 143         __global float2* __restrict__ gm_gridCell = gm_grid + gridThreadIndex;
 144
 145         const int kMajor = indexMajor + localOffsetMajor;
 146         /* Checking either X in XYZ, or Y in YZX cases */
 147         const float mMajor = (float)((kMajor < maxkMajor) ? kMajor : (kMajor - nMajor));
 148
 149         const int kMiddle = indexMiddle + localOffsetMiddle;
 150         float     mMiddle = (float)kMiddle;
 151         /* Checking Y in XYZ case */
 152         if (gridOrdering == XYZ)
 153         {
 154             mMiddle = (float)((kMiddle < maxkMiddle) ? kMiddle : (kMiddle - nMiddle));
 155         }
 156         const int kMinor = localOffsetMinor + indexMinor;
 157         float     mMinor = (float)kMinor;
 158         /* Checking X in YZX case */
 159         if (gridOrdering == YZX)
 160         {
 161             mMinor = (float)((kMinor < maxkMinor) ? kMinor : (kMinor - nMinor));
 162         }
 163         /* We should skip the k-space point (0,0,0) */
 164         const bool notZeroPoint = (kMinor > 0) | (kMajor > 0) | (kMiddle > 0);
 165
 166         float mX;
 167         float mY;
 168         float mZ;
 169         if (gridOrdering == YZX)
 170         {
 171             mX = mMinor;
 172             mY = mMajor;
 173             mZ = mMiddle;
 174         }
 175         if (gridOrdering == XYZ)
 176         {
 177             mX = mMajor;
 178             mY = mMiddle;
 179             mZ = mMinor;
 180         }
 181
 182         /* 0.5 correction factor for the first and last components of a Z dimension */
 183         float       corner_fac   = 1.0F;
 184         const float z_corner_fac = 0.5F;
 185         if (gridOrdering == YZX)
 186         {
 187             if ((kMiddle == 0) | (kMiddle == maxkMiddle))
 188             {
 189                 corner_fac = z_corner_fac;
 190             }
 191         }
 192         if (gridOrdering == XYZ)
 193         {
 194             if ((kMinor == 0) | (kMinor == maxkMinor))
 195             {
 196                 corner_fac = z_corner_fac;
 197             }
 198         }
 199
 200         if (notZeroPoint)
 201         {
 202             const float mhxk = mX * kernelParams.current.recipBox[XX][XX];
 203             const float mhyk = mX * kernelParams.current.recipBox[XX][YY]
 204                                + mY * kernelParams.current.recipBox[YY][YY];
 205             const float mhzk = mX * kernelParams.current.recipBox[XX][ZZ]
 206                                + mY * kernelParams.current.recipBox[YY][ZZ]
 207                                + mZ * kernelParams.current.recipBox[ZZ][ZZ];
 208
 209             const float m2k = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
 210             assert(m2k != 0.0F);
 211             const float denom = m2k * M_PI_F * kernelParams.current.boxVolume * gm_splineValueMajor[kMajor]
 212                                 * gm_splineValueMiddle[kMiddle] * gm_splineValueMinor[kMinor];
 213             assert(isfinite(denom));
 214             assert(denom != 0.0F);
 215             const float tmp1   = exp(-kernelParams.grid.ewaldFactor * m2k);
 216             const float etermk = kernelParams.constants.elFactor * tmp1 / denom;
 217
 218             float2       gridValue    = *gm_gridCell;
 219             const float2 oldGridValue = gridValue;
 220
 221             gridValue.x *= etermk;
 222             gridValue.y *= etermk;
 223             *gm_gridCell = gridValue;
 224
 225             if (computeEnergyAndVirial)
 226             {
 227                 const float tmp1k =
 228                         2.0F * (gridValue.x * oldGridValue.x + gridValue.y * oldGridValue.y);
 229
 230                 const float vfactor = (kernelParams.grid.ewaldFactor + 1.0F / m2k) * 2.0F;
 231                 const float ets2    = corner_fac * tmp1k;
 232                 energy              = ets2;
 233
 234                 const float ets2vf = ets2 * vfactor;
 235
 236                 virxx = ets2vf * mhxk * mhxk - ets2;
 237                 virxy = ets2vf * mhxk * mhyk;
 238                 virxz = ets2vf * mhxk * mhzk;
 239                 viryy = ets2vf * mhyk * mhyk - ets2;
 240                 viryz = ets2vf * mhyk * mhzk;
 241                 virzz = ets2vf * mhzk * mhzk - ets2;
 242             }
 243         }
 244     }
 245
 246     // This is only for reduction below. OpenCL 1.2: all local memory must be declared on kernel scope.
 247     __local float sm_virialAndEnergy[c_virialAndEnergyCount * warp_size];
 248
 249     /* Optional energy/virial reduction */
 250     if (computeEnergyAndVirial)
 251     {
 252         // TODO: implement AMD intrinsics reduction, like with shuffles in CUDA version. #2514
 253
 254         /* Shared memory reduction with atomics.
 255          * Each component is first reduced into warp_size positions in the shared memory;
 256          * Then first few warps reduce everything further and add to the global memory.
 257          * This can likely be improved, but is anyway faster than the previous straightforward reduction,
 258          * which was using too much shared memory (for storing all 7 floats on each thread).
 259          */
 260
 261         const int  lane      = threadLocalId & (warp_size - 1);
 262         const int  warpIndex = threadLocalId / warp_size;
 263         const bool firstWarp = (warpIndex == 0);
 264         if (firstWarp)
 265         {
 266             sm_virialAndEnergy[0 * warp_size + lane] = virxx;
 267             sm_virialAndEnergy[1 * warp_size + lane] = viryy;
 268             sm_virialAndEnergy[2 * warp_size + lane] = virzz;
 269             sm_virialAndEnergy[3 * warp_size + lane] = virxy;
 270             sm_virialAndEnergy[4 * warp_size + lane] = virxz;
 271             // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
 272             sm_virialAndEnergy[5 * warp_size + lane] = viryz;
 273             // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
 274             sm_virialAndEnergy[6 * warp_size + lane] = energy;
 275         }
 276         barrier(CLK_LOCAL_MEM_FENCE);
 277         if (!firstWarp)
 278         {
 279             atomicAdd_l_f(sm_virialAndEnergy + 0 * warp_size + lane, virxx);
 280             atomicAdd_l_f(sm_virialAndEnergy + 1 * warp_size + lane, viryy);
 281             atomicAdd_l_f(sm_virialAndEnergy + 2 * warp_size + lane, virzz);
 282             atomicAdd_l_f(sm_virialAndEnergy + 3 * warp_size + lane, virxy);
 283             atomicAdd_l_f(sm_virialAndEnergy + 4 * warp_size + lane, virxz);
 284             // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
 285             atomicAdd_l_f(sm_virialAndEnergy + 5 * warp_size + lane, viryz);
 286             // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
 287             atomicAdd_l_f(sm_virialAndEnergy + 6 * warp_size + lane, energy);
 288         }
 289         barrier(CLK_LOCAL_MEM_FENCE);
 290
 291         const int numIter = (c_virialAndEnergyCount + activeWarps - 1) / activeWarps;
 292         for (int i = 0; i < numIter; i++)
 293         {
 294             const int componentIndex = i * activeWarps + warpIndex;
 295             if (componentIndex < c_virialAndEnergyCount)
 296             {
 297                 const int targetIndex = componentIndex * warp_size + lane;
 298 #pragma unroll
 299                 for (int reductionStride = warp_size >> 1; reductionStride >= 1; reductionStride >>= 1)
 300                 {
 301                     if (lane < reductionStride)
 302                     {
 303                         sm_virialAndEnergy[targetIndex] +=
 304                                 sm_virialAndEnergy[targetIndex + reductionStride];
 305                     }
 306 #ifdef _NVIDIA_SOURCE_
 307                     /* FIXME: this execution happens within execution width aka warp, but somehow
 308                      * NVIDIA OpenCL of all things fails without the memory barrier here. #2519
 309                      */
 310                     barrier(CLK_LOCAL_MEM_FENCE);
 311 #endif
 312                 }
 313                 if (lane == 0)
 314                 {
 315                     atomicAdd_g_f(gm_virialAndEnergy + componentIndex, sm_virialAndEnergy[targetIndex]);
 316                 }
 317             }
 318         }
 319     }
 320 }