2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Implements PME OpenCL solving kernel.
38 * When including this and other PME OpenCL kernel files, plenty of common
39 * constants/macros are expected to be defined.
40 * For details, please see how pme_program.cl is compiled in pme_gpu_program_impl_ocl.cpp.
42 * This file's solving kernel specifically expects the following definitions for its flavors:
44 * - gridOrdering must be defined to either XYZ or YZX
45 * and corresponds to the dimension order of the grid (GridOrdering enum in CUDA kernels);
46 * - computeEnergyAndVirial must evaluate to true or false, and expresses
47 * whether the reduction is performed.
49 * \author Aleksei Iupinov <a.yupinov@gmail.com>
52 #include "gromacs/gpu_utils/vectype_ops.clh"
54 #include "pme_gpu_types.h"
57 * PME complex grid solver kernel function.
58 * Please see the file description for additional defines which this kernel expects.
60 * \param[in] kernelParams Input PME GPU data in constant memory.
61 * \param[in] gm_splineModuli B-Spline moduli.
62 * \param[out] gm_virialAndEnergy Reduced virial and enrgy (only with computeEnergyAndVirial ==
64 * \param[in,out] gm_grid Fourier grid to transform.
66 __attribute__((work_group_size_hint(c_solveMaxWorkGroupSize, 1, 1)))
67 __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKernelParams kernelParams,
68 __global const float* __restrict__ gm_splineModuli,
69 __global float* __restrict__ gm_virialAndEnergy,
70 __global float2* __restrict__ gm_grid)
72 /* This kernel supports 2 different grid dimension orderings: YZX and XYZ */
76 if (gridOrdering == YZX)
82 if (gridOrdering == XYZ)
89 __global const float* __restrict__ gm_splineValueMajor =
90 gm_splineModuli + kernelParams.grid.splineValuesOffset[majorDim];
91 __global const float* __restrict__ gm_splineValueMiddle =
92 gm_splineModuli + kernelParams.grid.splineValuesOffset[middleDim];
93 __global const float* __restrict__ gm_splineValueMinor =
94 gm_splineModuli + kernelParams.grid.splineValuesOffset[minorDim];
96 /* Various grid sizes and indices */
97 const int localOffsetMinor = 0; // unused
98 const int localOffsetMajor = 0; // unused
99 const int localOffsetMiddle = 0; // unused
100 const int localSizeMinor = kernelParams.grid.complexGridSizePadded[minorDim];
101 const int localSizeMiddle = kernelParams.grid.complexGridSizePadded[middleDim];
102 const int localCountMiddle = kernelParams.grid.complexGridSize[middleDim];
103 const int localCountMinor = kernelParams.grid.complexGridSize[minorDim];
104 const int nMajor = kernelParams.grid.realGridSize[majorDim];
105 const int nMiddle = kernelParams.grid.realGridSize[middleDim];
106 const int nMinor = kernelParams.grid.realGridSize[minorDim];
107 const int maxkMajor = (nMajor + 1) / 2; // X or Y
108 const int maxkMiddle = (nMiddle + 1) / 2; // Y OR Z => only check for !YZX
109 const int maxkMinor = (nMinor + 1) / 2; // Z or X => only check for YZX
111 /* Each thread works on one cell of the Fourier space complex 3D grid (gm_grid).
112 * Each block handles up to c_solveMaxWorkGroupSize cells -
113 * depending on the grid contiguous dimension size,
114 * that can range from a part of a single gridline to several complete gridlines.
116 const int threadLocalId = get_local_id(XX);
117 const int gridLineSize = localCountMinor;
118 const int gridLineIndex = threadLocalId / gridLineSize;
119 const int gridLineCellIndex = threadLocalId - gridLineSize * gridLineIndex;
120 const int gridLinesPerBlock = max((int)(get_local_size(XX)) / gridLineSize, 1);
121 const int activeWarps = ((int)get_local_size(XX) / warp_size);
122 assert((get_group_id(XX) * get_local_size(XX)) < MAX_INT);
123 const int indexMinor = (int)get_group_id(XX) * (int)get_local_size(XX) + gridLineCellIndex;
124 const int indexMiddle = (int)get_group_id(YY) * gridLinesPerBlock + gridLineIndex;
125 const int indexMajor = (int)get_group_id(ZZ);
127 /* Optional outputs */
136 assert(indexMajor < kernelParams.grid.complexGridSize[majorDim]);
137 if ((indexMiddle < localCountMiddle) & (indexMinor < localCountMinor)
138 & (gridLineIndex < gridLinesPerBlock))
140 /* The offset should be equal to the global thread index for coalesced access */
141 const int gridThreadIndex =
142 (indexMajor * localSizeMiddle + indexMiddle) * localSizeMinor + indexMinor;
143 __global float2* __restrict__ gm_gridCell = gm_grid + gridThreadIndex;
145 const int kMajor = indexMajor + localOffsetMajor;
146 /* Checking either X in XYZ, or Y in YZX cases */
147 const float mMajor = (float)((kMajor < maxkMajor) ? kMajor : (kMajor - nMajor));
149 const int kMiddle = indexMiddle + localOffsetMiddle;
150 float mMiddle = (float)kMiddle;
151 /* Checking Y in XYZ case */
152 if (gridOrdering == XYZ)
154 mMiddle = (float)((kMiddle < maxkMiddle) ? kMiddle : (kMiddle - nMiddle));
156 const int kMinor = localOffsetMinor + indexMinor;
157 float mMinor = (float)kMinor;
158 /* Checking X in YZX case */
159 if (gridOrdering == YZX)
161 mMinor = (float)((kMinor < maxkMinor) ? kMinor : (kMinor - nMinor));
163 /* We should skip the k-space point (0,0,0) */
164 const bool notZeroPoint = (kMinor > 0) | (kMajor > 0) | (kMiddle > 0);
169 if (gridOrdering == YZX)
175 if (gridOrdering == XYZ)
182 /* 0.5 correction factor for the first and last components of a Z dimension */
183 float corner_fac = 1.0F;
184 const float z_corner_fac = 0.5F;
185 if (gridOrdering == YZX)
187 if ((kMiddle == 0) | (kMiddle == maxkMiddle))
189 corner_fac = z_corner_fac;
192 if (gridOrdering == XYZ)
194 if ((kMinor == 0) | (kMinor == maxkMinor))
196 corner_fac = z_corner_fac;
202 const float mhxk = mX * kernelParams.current.recipBox[XX][XX];
203 const float mhyk = mX * kernelParams.current.recipBox[XX][YY]
204 + mY * kernelParams.current.recipBox[YY][YY];
205 const float mhzk = mX * kernelParams.current.recipBox[XX][ZZ]
206 + mY * kernelParams.current.recipBox[YY][ZZ]
207 + mZ * kernelParams.current.recipBox[ZZ][ZZ];
209 const float m2k = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
211 const float denom = m2k * M_PI_F * kernelParams.current.boxVolume * gm_splineValueMajor[kMajor]
212 * gm_splineValueMiddle[kMiddle] * gm_splineValueMinor[kMinor];
213 assert(isfinite(denom));
214 assert(denom != 0.0F);
215 const float tmp1 = exp(-kernelParams.grid.ewaldFactor * m2k);
216 const float etermk = kernelParams.constants.elFactor * tmp1 / denom;
218 float2 gridValue = *gm_gridCell;
219 const float2 oldGridValue = gridValue;
221 gridValue.x *= etermk;
222 gridValue.y *= etermk;
223 *gm_gridCell = gridValue;
225 if (computeEnergyAndVirial)
228 2.0F * (gridValue.x * oldGridValue.x + gridValue.y * oldGridValue.y);
230 const float vfactor = (kernelParams.grid.ewaldFactor + 1.0F / m2k) * 2.0F;
231 const float ets2 = corner_fac * tmp1k;
234 const float ets2vf = ets2 * vfactor;
236 virxx = ets2vf * mhxk * mhxk - ets2;
237 virxy = ets2vf * mhxk * mhyk;
238 virxz = ets2vf * mhxk * mhzk;
239 viryy = ets2vf * mhyk * mhyk - ets2;
240 viryz = ets2vf * mhyk * mhzk;
241 virzz = ets2vf * mhzk * mhzk - ets2;
246 // This is only for reduction below. OpenCL 1.2: all local memory must be declared on kernel scope.
247 __local float sm_virialAndEnergy[c_virialAndEnergyCount * warp_size];
249 /* Optional energy/virial reduction */
250 if (computeEnergyAndVirial)
252 // TODO: implement AMD intrinsics reduction, like with shuffles in CUDA version. #2514
254 /* Shared memory reduction with atomics.
255 * Each component is first reduced into warp_size positions in the shared memory;
256 * Then first few warps reduce everything further and add to the global memory.
257 * This can likely be improved, but is anyway faster than the previous straightforward reduction,
258 * which was using too much shared memory (for storing all 7 floats on each thread).
261 const int lane = threadLocalId & (warp_size - 1);
262 const int warpIndex = threadLocalId / warp_size;
263 const bool firstWarp = (warpIndex == 0);
266 sm_virialAndEnergy[0 * warp_size + lane] = virxx;
267 sm_virialAndEnergy[1 * warp_size + lane] = viryy;
268 sm_virialAndEnergy[2 * warp_size + lane] = virzz;
269 sm_virialAndEnergy[3 * warp_size + lane] = virxy;
270 sm_virialAndEnergy[4 * warp_size + lane] = virxz;
271 // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
272 sm_virialAndEnergy[5 * warp_size + lane] = viryz;
273 // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
274 sm_virialAndEnergy[6 * warp_size + lane] = energy;
276 barrier(CLK_LOCAL_MEM_FENCE);
279 atomicAdd_l_f(sm_virialAndEnergy + 0 * warp_size + lane, virxx);
280 atomicAdd_l_f(sm_virialAndEnergy + 1 * warp_size + lane, viryy);
281 atomicAdd_l_f(sm_virialAndEnergy + 2 * warp_size + lane, virzz);
282 atomicAdd_l_f(sm_virialAndEnergy + 3 * warp_size + lane, virxy);
283 atomicAdd_l_f(sm_virialAndEnergy + 4 * warp_size + lane, virxz);
284 // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
285 atomicAdd_l_f(sm_virialAndEnergy + 5 * warp_size + lane, viryz);
286 // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
287 atomicAdd_l_f(sm_virialAndEnergy + 6 * warp_size + lane, energy);
289 barrier(CLK_LOCAL_MEM_FENCE);
291 const int numIter = (c_virialAndEnergyCount + activeWarps - 1) / activeWarps;
292 for (int i = 0; i < numIter; i++)
294 const int componentIndex = i * activeWarps + warpIndex;
295 if (componentIndex < c_virialAndEnergyCount)
297 const int targetIndex = componentIndex * warp_size + lane;
299 for (int reductionStride = warp_size >> 1; reductionStride >= 1; reductionStride >>= 1)
301 if (lane < reductionStride)
303 sm_virialAndEnergy[targetIndex] +=
304 sm_virialAndEnergy[targetIndex + reductionStride];
306 #ifdef _NVIDIA_SOURCE_
307 /* FIXME: this execution happens within execution width aka warp, but somehow
308 * NVIDIA OpenCL of all things fails without the memory barrier here. #2519
310 barrier(CLK_LOCAL_MEM_FENCE);
315 atomicAdd_g_f(gm_virialAndEnergy + componentIndex, sm_virialAndEnergy[targetIndex]);