/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief Implements PME OpenCL force gathering kernel.
 * When including this and other PME OpenCL kernel files, plenty of common
 * constants/macros are expected to be defined (such as "order", which is the PME interpolation order).
 * For details, please see how pme_program.cl is compiled in pme_gpu_program_impl_ocl.cpp.
 *
 * This file's kernels specifically expect the following definitions:
 *
 * - atomsPerBlock, which expresses how many atoms are processed by a single work group
 * - order, which is the PME interpolation order
 * - wrapX and wrapY, which must evaluate to either true or false to specify whether the grid
 *   overlap in dimension X/Y is to be used
 *
 * \author Aleksei Iupinov <a.yupinov@gmail.com>
 */

#include "pme_gpu_calculate_splines.clh"
#include "pme_gpu_types.h"

#ifndef COMPILE_GATHER_HELPERS_ONCE
#    define COMPILE_GATHER_HELPERS_ONCE

/*! \brief
 * Unrolls the dynamic index accesses to the constant grid sizes to avoid local memory operations.
 */
inline float read_grid_size(const float* realGridSizeFP, const int dimIndex)
{
    switch (dimIndex)
    {
        case XX: return realGridSizeFP[XX];
        case YY: return realGridSizeFP[YY];
        case ZZ: return realGridSizeFP[ZZ];
        default: assert(false); break;
    }
    assert(false);
    return 0.0F;
}

/*! \brief Reduce the partial force contributions.
 *
 * FIXME: this reduction should be simplified and improved; it does a 3x16 force component
 * reduction per 16 threads, so no extra shared memory should be needed for intermediates
 * or passing results back.
 *
 * \param[out] sm_forces          Local memory array with the output forces (rvec).
 * \param[in]  atomIndexLocal     Local atom index.
 * \param[in]  splineIndex        Spline index.
 * \param[in]  lineIndex          Line index (same as threadLocalId).
 * \param[in]  realGridSizeFP     Local grid size constant.
 * \param[in]  fx                 Input force partial component X.
 * \param[in]  fy                 Input force partial component Y.
 * \param[in]  fz                 Input force partial component Z.
 * \param[in,out] sm_forceReduction Reduction working buffer.
 * \param[in]  sm_forceTemp       Convenience pointers into \p sm_forceReduction.
 */
inline void reduce_atom_forces(__local float* __restrict__ sm_forces,
                               const int    atomIndexLocal,
                               const int    splineIndex,
                               const int    lineIndex,
                               const float* realGridSizeFP,
                               const float  fx,
                               const float  fy,
                               const float  fz,
                               __local float* __restrict__ sm_forceReduction,
                               __local float** __restrict__ sm_forceTemp)
{
    // TODO: implement AMD intrinsics reduction, like with shuffles in the CUDA version. #2514

    /* Number of data components and threads for a single atom */
#    define atomDataSize threadsPerAtom
    // We use blockSize local memory elements to read fx, or fy, or fz, and then reduce them to
    // fit into smemPerDim elements. These are defines rather than consts because they are used
    // for the local memory array size.
#    define blockSize (atomsPerBlock * atomDataSize)
#    define smemPerDim warp_size
#    define smemReserved (DIM * smemPerDim)

    const int numWarps  = blockSize / smemPerDim;
    const int minStride = max(1, atomDataSize / numWarps);
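    /* Worked example of the sizing above (an illustration only; the actual values depend on
     * the compile-time defines): with order 4, atomDataSize = threadsPerAtom = 16; assuming
     * atomsPerBlock = 8 and warp_size = 32, blockSize = 128, numWarps = 128 / 32 = 4, and
     * minStride = max(1, 16 / 4) = 4 partial sums remain per atom and dimension after the
     * first reduction stage below. */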

    for (int dimIndex = 0; dimIndex < DIM; dimIndex++)
    {
        int elementIndex = smemReserved + lineIndex;
        // Store input force contributions
        sm_forceReduction[elementIndex] = (dimIndex == XX) ? fx : (dimIndex == YY) ? fy : fz;
#    if (warp_size < 48)
        // sync here when exec width is smaller than the size of the sm_forceReduction
        // buffer flushed to local mem above (size 3*16) as different warps will consume
        // the data below
        barrier(CLK_LOCAL_MEM_FENCE);
#    endif

        // Reduce to fit into smemPerDim (warp size)
        for (int redStride = atomDataSize >> 1; redStride > minStride; redStride >>= 1)
        {
            if (splineIndex < redStride)
            {
                sm_forceReduction[elementIndex] += sm_forceReduction[elementIndex + redStride];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // Last iteration - packing everything to be nearby, storing a convenience pointer
        sm_forceTemp[dimIndex] = sm_forceReduction + dimIndex * smemPerDim;
        int redStride          = minStride;
        if (splineIndex < redStride)
        {
            const int packedIndex = atomIndexLocal * redStride + splineIndex;
            sm_forceTemp[dimIndex][packedIndex] =
                    sm_forceReduction[elementIndex] + sm_forceReduction[elementIndex + redStride];
        }
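        /* After this packing step, the minStride remaining partial sums of atom a for the
         * current dimension live contiguously in sm_forceTemp[dimIndex][a * minStride ..
         * a * minStride + minStride - 1], at the start of that dimension's smemPerDim region,
         * so the per-dimension warps below can finish them with strided adds. */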

        // barrier only needed for the last iteration on hardware with >=64-wide execution (e.g. AMD)
#    if (warp_size < 64)
        barrier(CLK_LOCAL_MEM_FENCE);
#    endif
    }

#    if (warp_size >= 64)
    barrier(CLK_LOCAL_MEM_FENCE);
#    endif

    assert((blockSize / warp_size) >= DIM);

    const int warpIndex = lineIndex / warp_size;
    const int dimIndex  = warpIndex;

    // First 3 warps can now process 1 dimension each
    if (dimIndex < DIM)
    {
        const int sourceIndex = lineIndex % warp_size;
        for (int redStride = minStride >> 1; redStride > 1; redStride >>= 1)
        {
            if (!(splineIndex & redStride))
            {
                sm_forceTemp[dimIndex][sourceIndex] += sm_forceTemp[dimIndex][sourceIndex + redStride];
            }
        }

        const float n = read_grid_size(realGridSizeFP, dimIndex);
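        /* The spline derivatives accumulated above are taken with respect to the grid
         * coordinate; scaling by the grid size n converts the gradient to fractional
         * coordinates of the unit cell, ready for the recipBox transform applied later in
         * calculateAndStoreGridForces(). */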
        const int atomIndex = sourceIndex / minStride;
        if (sourceIndex == minStride * atomIndex)
        {
            sm_forces[atomIndex * DIM + dimIndex] =
                    (sm_forceTemp[dimIndex][sourceIndex] + sm_forceTemp[dimIndex][sourceIndex + 1]) * n;
        }
    }
}

/*! \brief Calculate the sum of the force partial components (in X, Y and Z).
 *
 * \param[out] fx                 The force partial component in the X dimension.
 * \param[out] fy                 The force partial component in the Y dimension.
 * \param[out] fz                 The force partial component in the Z dimension.
 * \param[in]  ixBase             The grid line index base value in the X dimension.
 * \param[in]  nx                 The grid real size in the X dimension.
 * \param[in]  pny                The padded grid real size in the Y dimension.
 * \param[in]  pnz                The padded grid real size in the Z dimension.
 * \param[in]  constOffset        The offset to calculate the global grid index.
 * \param[in]  splineIndexBase    The base value of the spline parameter index.
 * \param[in]  tdy                The theta and dtheta in the Y dimension.
 * \param[in]  tdz                The theta and dtheta in the Z dimension.
 * \param[in]  sm_splineParams    Shared memory array of spline parameters.
 * \param[in]  gm_grid            Global memory array of the grid to use.
 */
inline void sumForceComponents(float*       fx,
                               float*       fy,
                               float*       fz,
                               const int    ixBase,
                               const int    nx,
                               const int    pny,
                               const int    pnz,
                               const int    constOffset,
                               const int    splineIndexBase,
                               const float2 tdy,
                               const float2 tdz,
                               __local const float2* __restrict__ sm_splineParams,
                               __global const float* __restrict__ gm_grid)
{
#    pragma unroll order
    for (int ithx = 0; (ithx < order); ithx++)
    {
        int ix = ixBase + ithx;
        if (wrapX & (ix >= nx))
        {
            ix -= nx;
        }
        const int gridIndexGlobal = ix * pny * pnz + constOffset;
        assert(gridIndexGlobal >= 0);
        const float gridValue = gm_grid[gridIndexGlobal];
        assert(isfinite(gridValue));
        const int    splineIndexX = getSplineParamIndex(splineIndexBase, XX, ithx);
        const float2 tdx          = sm_splineParams[splineIndexX];
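        /* Separable B-spline force formula: each Cartesian partial component takes the
         * derivative (the .y member, dtheta) in its own dimension and the plain weights
         * (the .x member, theta) in the other two, all multiplied by the grid value. */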
        const float fxy1 = tdz.x * gridValue;
        const float fz1  = tdz.y * gridValue;
        *fx += tdx.y * tdy.x * fxy1;
        *fy += tdx.x * tdy.y * fxy1;
        *fz += tdx.x * tdy.x * fz1;
    }
}

/*! \brief Calculate the grid forces and store them in shared memory.
 *
 * \param[in,out] sm_forces       Shared memory array with the output forces.
 * \param[in] forceIndexLocal     The local (per thread) index in the sm_forces array.
 * \param[in] forceIndexGlobal    The index of the thread in the gm_coefficients array.
 * \param[in] recipBox            The reciprocal box.
 * \param[in] scale               The scale to use when calculating the forces (only relevant when
 *                                more than one coefficient set is in use).
 * \param[in] gm_coefficients     Global memory array of the coefficients to use.
 */
inline void calculateAndStoreGridForces(__local float* __restrict__ sm_forces,
                                        const int   forceIndexLocal,
                                        const int   forceIndexGlobal,
                                        const float recipBox[DIM][DIM],
                                        const float scale,
                                        __global const float* __restrict__ gm_coefficients)
{
    const float3 atomForces     = vload3(forceIndexLocal, sm_forces);
    float        negCoefficient = -scale * gm_coefficients[forceIndexGlobal];
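    /* Only the upper-triangular entries of recipBox are referenced below; in the GPU layout
     * of GROMACS's triclinic boxes, the [YY][XX], [ZZ][XX] and [ZZ][YY] entries are zero,
     * which keeps the fractional-to-Cartesian force transform down to six multiplications. */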
    float3 result;
    result.x = negCoefficient * recipBox[XX][XX] * atomForces.x;
    result.y = negCoefficient * (recipBox[XX][YY] * atomForces.x + recipBox[YY][YY] * atomForces.y);
    result.z = negCoefficient
               * (recipBox[XX][ZZ] * atomForces.x + recipBox[YY][ZZ] * atomForces.y
                  + recipBox[ZZ][ZZ] * atomForces.z);
    vstore3(result, forceIndexLocal, sm_forces);
}

#endif // COMPILE_GATHER_HELPERS_ONCE

/*! \brief
 * An OpenCL kernel which gathers the atom forces from the grid.
 * The grid is assumed to be wrapped in dimension Z.
 * Please see the file description for additional defines which this kernel expects.
 *
 * \param[in] kernelParams        All the PME GPU data.
 * \param[in] gm_coefficientsA    Atom charges/coefficients in the unperturbed state, or FEP
 * state A.
 * \param[in] gm_coefficientsB    Atom charges/coefficients in FEP state B. Only used
 * when spreading interpolated coefficients on one grid or spreading two sets of coefficients on two
 * grids.
 * \param[in] gm_gridA            Global 3D grid for the unperturbed state, FEP
 * state A or the single grid used for interpolated coefficients on one grid in FEP A/B.
 * \param[in] gm_gridB            Global 3D grid for FEP state B when using dual
 * grids (when calculating energy and virials).
 * \param[in] gm_theta            Atom spline parameter values.
 * \param[in] gm_dtheta           Atom spline parameter derivatives.
 * \param[in] gm_gridlineIndices  Atom gridline indices (ivec).
 * \param[in,out] gm_forces       Atom forces (rvec).
 */
__attribute__((reqd_work_group_size(order, order, atomsPerBlock)))
__kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKernelParams kernelParams,
                                                        __global const float* __restrict__ gm_coefficientsA,
                                                        __global const float* __restrict__ gm_coefficientsB,
                                                        __global const float* __restrict__ gm_gridA,
                                                        __global const float* __restrict__ gm_gridB,
                                                        __global const float* __restrict__ gm_theta,
                                                        __global const float* __restrict__ gm_dtheta,
                                                        __global const int* __restrict__ gm_gridlineIndices,
                                                        __global float* __restrict__ gm_forces)
{
    assert(numGrids == 1 || numGrids == 2);

    /* These are the atom indices - for the shared and global memory */
    const int atomIndexLocal  = get_local_id(ZZ);
    const int atomIndexOffset = (int)get_group_id(XX) * atomsPerBlock;
    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;

    /* Some sizes which are defines and not consts because they go into the array size */
#define blockSize (atomsPerBlock * atomDataSize)
    assert(blockSize == (get_local_size(0) * get_local_size(1) * get_local_size(2)));
#define smemPerDim warp_size
#define smemReserved (DIM * smemPerDim)
#define totalSharedMemory (smemReserved + blockSize)
#define gridlineIndicesSize (atomsPerBlock * DIM)
#define splineParamsSize (atomsPerBlock * DIM * order)

    __local int    sm_gridlineIndices[gridlineIndicesSize];
    __local float2 sm_splineParams[splineParamsSize]; /* Theta/dtheta pairs as .x/.y */
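    /* Example of the resulting footprint (illustrative values; atomsPerBlock is a
     * compile-time define): with order 4 and atomsPerBlock = 8, sm_gridlineIndices holds
     * 8 * 3 = 24 ints and sm_splineParams holds 8 * 3 * 4 = 96 float2 theta/dtheta pairs. */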

    /* Spline Y/Z coordinates */
    const int ithy = get_local_id(YY);
    const int ithz = get_local_id(XX);

    assert((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0)
           <= MAX_INT);
    const int threadLocalId =
            (int)((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0)
                  + get_local_id(0));

    /* These are the spline contribution indices in shared memory */
    assert((get_local_id(1) * get_local_size(0) + get_local_id(0)) <= MAX_INT);
    const int splineIndex =
            (int)(get_local_id(1) * get_local_size(0)
                  + get_local_id(0)); /* Relative to the current particle, 0..15 for order 4 */
    const int lineIndex = threadLocalId; /* And to all the block's particles */
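    /* With the required work group size (order, order, atomsPerBlock) this means, e.g. for
     * order 4: ithz = get_local_id(0) and ithy = get_local_id(1), so splineIndex = ithy * 4 + ithz
     * enumerates the 16 (y, z) spline contribution pairs of one atom, while lineIndex also
     * distinguishes the atoms within the work group. */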

    /* Staging the atom gridline indices, DIM * atomsPerBlock threads */
    const int localGridlineIndicesIndex = threadLocalId;
    const int globalGridlineIndicesIndex =
            (int)get_group_id(XX) * gridlineIndicesSize + localGridlineIndicesIndex;
    if (localGridlineIndicesIndex < gridlineIndicesSize)
    {
        sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
        assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
    }
    /* Staging the spline parameters, DIM * order * atomsPerBlock threads */
    const int localSplineParamsIndex  = threadLocalId;
    const int globalSplineParamsIndex = (int)get_group_id(XX) * splineParamsSize + localSplineParamsIndex;
    if (localSplineParamsIndex < splineParamsSize)
    {
        sm_splineParams[localSplineParamsIndex].x = gm_theta[globalSplineParamsIndex];
        sm_splineParams[localSplineParamsIndex].y = gm_dtheta[globalSplineParamsIndex];
        assert(isfinite(sm_splineParams[localSplineParamsIndex].x));
        assert(isfinite(sm_splineParams[localSplineParamsIndex].y));
    }
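    /* Every thread below consumes gridline indices and spline parameters staged by other
     * threads of the work group, so synchronize before reading them. */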
    barrier(CLK_LOCAL_MEM_FENCE);

    float fx = 0.0F;
    float fy = 0.0F;
    float fz = 0.0F;

    int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsA[atomIndexGlobal]);

    const int nx  = kernelParams.grid.realGridSize[XX];
    const int ny  = kernelParams.grid.realGridSize[YY];
    const int nz  = kernelParams.grid.realGridSize[ZZ];
    const int pny = kernelParams.grid.realGridSizePadded[YY];
    const int pnz = kernelParams.grid.realGridSizePadded[ZZ];

    const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
    const int warpIndex     = atomIndexLocal / atomsPerWarp;

    const int    splineIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex);
    const int    splineIndexY    = getSplineParamIndex(splineIndexBase, YY, ithy);
    const float2 tdy             = sm_splineParams[splineIndexY];
    const int    splineIndexZ    = getSplineParamIndex(splineIndexBase, ZZ, ithz);
    const float2 tdz             = sm_splineParams[splineIndexZ];
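    /* tdy and tdz hold this thread's fixed Y/Z spline data: the .x member is theta (the
     * B-spline weight) and the .y member is dtheta (its derivative), matching the staging
     * of gm_theta/gm_dtheta into sm_splineParams above. */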

    const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];
    int       iy     = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
    if (wrapY & (iy >= ny))
    {
        iy -= ny;
    }
    int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
    if (iz >= nz)
    {
        iz -= nz;
    }
    const int constOffset = iy * pnz + iz;
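    /* The flattened padded-grid index of cell (ix, iy, iz) is ix * pny * pnz + iy * pnz + iz;
     * constOffset precomputes the (iy, iz) part, which is shared by all order iterations
     * over ithx inside sumForceComponents(). */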

    if (chargeCheck)
    {
        sumForceComponents(
                &fx, &fy, &fz, ixBase, nx, pny, pnz, constOffset, splineIndexBase, tdy, tdz, sm_splineParams, gm_gridA);
    }

    // Reduction of partial force contributions
    __local float sm_forces[atomsPerBlock * DIM];

    __local float  sm_forceReduction[totalSharedMemory];
    __local float* sm_forceTemp[DIM];

    reduce_atom_forces(sm_forces,
                       atomIndexLocal,
                       splineIndex,
                       lineIndex,
                       kernelParams.grid.realGridSizeFP,
                       fx,
                       fy,
                       fz,
                       sm_forceReduction,
                       sm_forceTemp);
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Calculating the final forces with no component branching, atomsPerBlock threads */
    const int   forceIndexLocal  = threadLocalId;
    const int   forceIndexGlobal = atomIndexOffset + forceIndexLocal;
    const float scale            = kernelParams.current.scale;
    if (forceIndexLocal < atomsPerBlock)
    {
        calculateAndStoreGridForces(
                sm_forces, forceIndexLocal, forceIndexGlobal, kernelParams.current.recipBox, scale, gm_coefficientsA);
    }

#if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_)
    /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was
     * __syncwarp() in CUDA. #2519
     */
    barrier(CLK_LOCAL_MEM_FENCE);
#endif

    assert(atomsPerBlock <= warp_size);

    /* Writing or adding the final forces component-wise, single warp */
    const int blockForcesSize = atomsPerBlock * DIM;
    const int numIter         = (blockForcesSize + warp_size - 1) / warp_size;
    const int iterThreads     = blockForcesSize / numIter;
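    /* Illustrative arithmetic (assuming atomsPerBlock = 8 and warp_size = 32):
     * blockForcesSize = 8 * 3 = 24 floats, numIter = (24 + 31) / 32 = 1 and
     * iterThreads = 24, so a single pass of one warp writes the whole block. */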
    if (threadLocalId < iterThreads)
    {
        for (int i = 0; i < numIter; i++)
        {
            const int   outputIndexLocal     = i * iterThreads + threadLocalId;
            const int   outputIndexGlobal    = (int)get_group_id(XX) * blockForcesSize + outputIndexLocal;
            const float outputForceComponent = sm_forces[outputIndexLocal];
            gm_forces[outputIndexGlobal]     = outputForceComponent;
        }
    }
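    /* Second gather pass for FEP state B when two separate grids are used: the same shared
     * buffers are reused, so the work group must synchronize before overwriting them, and
     * the results are accumulated into gm_forces rather than stored. */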

    if (numGrids == 2 && scale != 1.0F)
    {
        barrier(CLK_LOCAL_MEM_FENCE);
        fx          = 0.0F;
        fy          = 0.0F;
        fz          = 0.0F;
        chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsB[atomIndexGlobal]);
        if (chargeCheck)
        {
            sumForceComponents(
                    &fx, &fy, &fz, ixBase, nx, pny, pnz, constOffset, splineIndexBase, tdy, tdz, sm_splineParams, gm_gridB);
        }
        reduce_atom_forces(sm_forces,
                           atomIndexLocal,
                           splineIndex,
                           lineIndex,
                           kernelParams.grid.realGridSizeFP,
                           fx,
                           fy,
                           fz,
                           sm_forceReduction,
                           sm_forceTemp);
        barrier(CLK_LOCAL_MEM_FENCE);

        if (forceIndexLocal < atomsPerBlock)
        {
            calculateAndStoreGridForces(sm_forces,
                                        forceIndexLocal,
                                        forceIndexGlobal,
                                        kernelParams.current.recipBox,
                                        1.0F - scale,
                                        gm_coefficientsB);
        }

#if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_)
        /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was
         * __syncwarp() in CUDA. #2519
         */
        barrier(CLK_LOCAL_MEM_FENCE);
#endif

        /* Writing or adding the final forces component-wise, single warp */
        if (threadLocalId < iterThreads)
        {
            for (int i = 0; i < numIter; i++)
            {
                const int   outputIndexLocal     = i * iterThreads + threadLocalId;
                const int   outputIndexGlobal    = (int)get_group_id(XX) * blockForcesSize + outputIndexLocal;
                const float outputForceComponent = sm_forces[outputIndexLocal];
                gm_forces[outputIndexGlobal] += outputForceComponent;
            }
        }
    }
}