/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/*! \internal \file
 * \brief Implements PME OpenCL force gathering kernel.
 * When including this and other PME OpenCL kernel files, a number of common
 * constants/macros are expected to be defined (such as "order", the PME interpolation order).
 * For details, please see how pme_program.cl is compiled in pme_gpu_program_impl_ocl.cpp.
 *
 * This file's kernels specifically expect the following definitions:
 *
 * - atomsPerBlock, which expresses how many atoms are processed by a single work group
 * - order, which is the PME interpolation order
 * - overwriteForces, which must evaluate to either true or false to specify whether the kernel
 *   overwrites or reduces into the forces buffer
 * - wrapX and wrapY, which must evaluate to either true or false to specify whether the grid
 *   overlap in dimension X/Y is to be used
 *
 * An illustrative sketch of such definitions follows the includes below.
 *
 * \author Aleksei Iupinov <a.yupinov@gmail.com>
 */

#include "pme_gpu_types.h"
#include "pme_gpu_utils.clh"
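
/* For orientation, a sketch of what the expected definitions might look like, with purely
 * illustrative values; the actual values are injected when pme_program.cl is assembled and
 * compiled (see pme_gpu_program_impl_ocl.cpp). Other macros used below (threadsPerAtom,
 * atomsPerWarp, warp_size, DIM, XX/YY/ZZ) are provided the same way:
 *
 *   #define order           4
 *   #define atomsPerBlock   8
 *   #define overwriteForces true
 *   #define wrapX           true
 *   #define wrapY           true
 */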

#ifndef COMPILE_GATHER_HELPERS_ONCE
#define COMPILE_GATHER_HELPERS_ONCE

/*! \brief
 * Unrolls the dynamic index accesses to the constant grid sizes to avoid local memory operations.
 */
inline float read_grid_size(const float *realGridSizeFP,
                            const int    dimIndex)
{
    switch (dimIndex)
    {
        case XX: return realGridSizeFP[XX];
        case YY: return realGridSizeFP[YY];
        case ZZ: return realGridSizeFP[ZZ];
    }
    assert(false);
    return 0.0f;
}

/*! \brief Reduce the partial force contributions.
 *
 * FIXME: this reduction should be simplified and improved: it does a 3x16 force component
 * reduction per 16 threads, so no extra shared memory should be needed for intermediates
 * or for passing results back.
 *
 * \param[out]    sm_forces         Local memory array with the output forces (rvec).
 * \param[in]     atomIndexLocal    Local atom index
 * \param[in]     splineIndex       Spline index
 * \param[in]     lineIndex         Line index (same as threadLocalId)
 * \param[in]     realGridSizeFP    Local grid size constant
 * \param[in]     fx                Input force partial component X
 * \param[in]     fy                Input force partial component Y
 * \param[in]     fz                Input force partial component Z
 * \param[in,out] sm_forceReduction Reduction working buffer
 * \param[in]     sm_forceTemp      Convenience pointers into \p sm_forceReduction
 */
inline void reduce_atom_forces(__local float * __restrict__  sm_forces,
                               const int                     atomIndexLocal,
                               const int                     splineIndex,
                               const int                     lineIndex,
                               const float                  *realGridSizeFP,
                               const float                   fx,
                               const float                   fy,
                               const float                   fz,
                               __local float * __restrict__  sm_forceReduction,
                               __local float ** __restrict__ sm_forceTemp)
{
    // TODO: implement AMD intrinsics reduction, like with shuffles in CUDA version. #2514

    /* Number of data components and threads for a single atom */
#define atomDataSize threadsPerAtom
    // We use blockSize local memory elements to read fx, or fy, or fz, and then reduce them to fit into smemPerDim elements.
    // These are defines rather than consts because they are used as local memory array sizes.
#define blockSize (atomsPerBlock * atomDataSize)
#define smemPerDim warp_size
#define smemReserved (DIM * smemPerDim)

    const int numWarps  = blockSize / smemPerDim;
    const int minStride = max(1, atomDataSize / numWarps);
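
    /* Worked example with illustrative values: for order 4, threadsPerAtom (= atomDataSize)
     * is 16; with atomsPerBlock = 8 and warp_size = 32 this gives blockSize = 128,
     * numWarps = 128 / 32 = 4 and minStride = max(1, 16 / 4) = 4.
     */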

#pragma unroll DIM
    for (int dimIndex = 0; dimIndex < DIM; dimIndex++)
    {
        int elementIndex = smemReserved + lineIndex;
        // Store input force contributions
        sm_forceReduction[elementIndex] = (dimIndex == XX) ? fx : (dimIndex == YY) ? fy : fz;

#if !defined(_AMD_SOURCE_)
        /* This barrier was not needed in CUDA, nor is it needed on AMD GPUs.
         * Different OpenCL compilers might have different ideas
         * about #pragma unroll, though. OpenCL 2 has __attribute__((opencl_unroll_hint)).
         */
        barrier(CLK_LOCAL_MEM_FENCE);
#endif

        // Reduce to fit into smemPerDim (warp size)
#pragma unroll
        for (int redStride = atomDataSize >> 1; redStride > minStride; redStride >>= 1)
        {
            if (splineIndex < redStride)
            {
                sm_forceReduction[elementIndex] += sm_forceReduction[elementIndex + redStride];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // Last iteration - packing everything to be nearby, storing convenience pointer
        sm_forceTemp[dimIndex] = sm_forceReduction + dimIndex * smemPerDim;
        int redStride = minStride;
        if (splineIndex < redStride)
        {
            const int packedIndex = atomIndexLocal * redStride + splineIndex;
            sm_forceTemp[dimIndex][packedIndex] = sm_forceReduction[elementIndex] + sm_forceReduction[elementIndex + redStride];
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    assert((blockSize / warp_size) >= DIM);

    const int warpIndex = lineIndex / warp_size;
    const int dimIndex  = warpIndex;

    // First 3 warps can now process 1 dimension each
    if (dimIndex < DIM)
    {
        int sourceIndex = lineIndex % warp_size;
#pragma unroll
        for (int redStride = minStride >> 1; redStride > 1; redStride >>= 1)
        {
            if (!(splineIndex & redStride))
            {
                sm_forceTemp[dimIndex][sourceIndex] += sm_forceTemp[dimIndex][sourceIndex + redStride];
            }
        }
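
        /* The grid-size factor n below can be read as part of a chain rule: the spline
         * derivatives are taken with respect to grid coordinates u = n * (fractional
         * coordinate), and the reciprocal-box multiply near the end of the gather kernel
         * completes the conversion of the gradient to Cartesian forces.
         */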

        const float n = read_grid_size(realGridSizeFP, dimIndex);

        const int atomIndex = sourceIndex / minStride;
        if (sourceIndex == minStride * atomIndex)
        {
            sm_forces[atomIndex * DIM + dimIndex] = (sm_forceTemp[dimIndex][sourceIndex] + sm_forceTemp[dimIndex][sourceIndex + 1]) * n;
        }
    }
}

#endif // COMPILE_GATHER_HELPERS_ONCE

/*! \brief
 * An OpenCL kernel which gathers the atom forces from the grid.
 * The grid is assumed to be wrapped in dimension Z.
 * Please see the file description for additional defines which this kernel expects.
 *
 * \param[in]     kernelParams       All the PME GPU data.
 * \param[in]     gm_coefficients    Atom charges/coefficients.
 * \param[in]     gm_grid            Global 3D grid.
 * \param[in]     gm_theta           Atom spline parameter values
 * \param[in]     gm_dtheta          Atom spline parameter derivatives
 * \param[in]     gm_gridlineIndices Atom gridline indices (ivec)
 * \param[in,out] gm_forces          Atom forces (rvec)
 */
__attribute__((reqd_work_group_size(order, order, atomsPerBlock)))
__kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKernelParams kernelParams,
                                                        __global const float * __restrict__ gm_coefficients,
                                                        __global const float * __restrict__ gm_grid,
                                                        __global const float * __restrict__ gm_theta,
                                                        __global const float * __restrict__ gm_dtheta,
                                                        __global const int * __restrict__   gm_gridlineIndices,
                                                        __global float * __restrict__       gm_forces)
{
    /* These are the atom indices - for the shared and global memory */
    const int atomIndexLocal  = get_local_id(ZZ);
    const int atomIndexOffset = get_group_id(XX) * atomsPerBlock;
    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;

    /* These sizes are defines rather than consts because they are used as array sizes */
#define blockSize (atomsPerBlock * atomDataSize)
    assert(blockSize == (get_local_size(0) * get_local_size(1) * get_local_size(2)));
#define smemPerDim warp_size
#define smemReserved (DIM * smemPerDim)
#define totalSharedMemory (smemReserved + blockSize)
#define gridlineIndicesSize (atomsPerBlock * DIM)
#define splineParamsSize (atomsPerBlock * DIM * order)

    __local int    sm_gridlineIndices[gridlineIndicesSize];
    __local float2 sm_splineParams[splineParamsSize]; /* Theta/dtheta pairs as .x/.y */

    /* Spline Y/Z coordinates */
    const int ithy = get_local_id(YY);
    const int ithz = get_local_id(XX);

    const int threadLocalId = (get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0);
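
    /* With the required work-group size (order, order, atomsPerBlock) - e.g. (4, 4, 8) for
     * order 4 and 8 atoms per block (illustrative values) - threadLocalId linearizes the 3D
     * local id into the range 0..blockSize-1.
     */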

    /* These are the spline contribution indices in shared memory */
    const int splineIndex = get_local_id(1) * get_local_size(0) + get_local_id(0); /* Relative to the current particle, 0..15 for order 4 */
    const int lineIndex   = threadLocalId;                                         /* And to all the block's particles */

    /* Staging the atom gridline indices, DIM * atomsPerBlock threads */
    const int localGridlineIndicesIndex  = threadLocalId;
    const int globalGridlineIndicesIndex = get_group_id(XX) * gridlineIndicesSize + localGridlineIndicesIndex;
    const int globalCheckIndices         = pme_gpu_check_atom_data_index(globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
    if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
    {
        sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
        assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
    }
    /* Staging the spline parameters, DIM * order * atomsPerBlock threads */
    const int localSplineParamsIndex  = threadLocalId;
    const int globalSplineParamsIndex = get_group_id(XX) * splineParamsSize + localSplineParamsIndex;
    const int globalCheckSplineParams = pme_gpu_check_atom_data_index(globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
    if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
    {
        sm_splineParams[localSplineParamsIndex].x = gm_theta[globalSplineParamsIndex];
        sm_splineParams[localSplineParamsIndex].y = gm_dtheta[globalSplineParamsIndex];
        assert(isfinite(sm_splineParams[localSplineParamsIndex].x));
        assert(isfinite(sm_splineParams[localSplineParamsIndex].y));
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    float fx = 0.0f;
    float fy = 0.0f;
    float fz = 0.0f;

    const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
    const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]);

    if (chargeCheck & globalCheck)
    {
        const int nx  = kernelParams.grid.realGridSize[XX];
        const int ny  = kernelParams.grid.realGridSize[YY];
        const int nz  = kernelParams.grid.realGridSize[ZZ];
        const int pny = kernelParams.grid.realGridSizePadded[YY];
        const int pnz = kernelParams.grid.realGridSizePadded[ZZ];

        const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
        const int warpIndex     = atomIndexLocal / atomsPerWarp;

        const int    splineIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex);
        const int    splineIndexY    = getSplineParamIndex(splineIndexBase, YY, ithy);
        const float2 tdy             = sm_splineParams[splineIndexY];
        const int    splineIndexZ    = getSplineParamIndex(splineIndexBase, ZZ, ithz);
        const float2 tdz             = sm_splineParams[splineIndexZ];

        const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];
        int       iy     = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
        if (wrapY & (iy >= ny))
        {
            iy -= ny;
        }
        int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
        if (iz >= nz)
        {
            iz -= nz;
        }
        const int constOffset = iy * pnz + iz;
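
        /* The real grid is stored row-major with padded Y and Z dimensions, so grid point
         * (ix, iy, iz) lives at flat index ix * pny * pnz + iy * pnz + iz; constOffset
         * pre-computes the part that does not depend on ithx for the loop below.
         */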

#pragma unroll order
        for (int ithx = 0; (ithx < order); ithx++)
        {
            int ix = ixBase + ithx;
            if (wrapX & (ix >= nx))
            {
                ix -= nx;
            }
            const int gridIndexGlobal = ix * pny * pnz + constOffset;
            assert(gridIndexGlobal >= 0);
            const float gridValue = gm_grid[gridIndexGlobal];
            assert(isfinite(gridValue));
            const int    splineIndexX = getSplineParamIndex(splineIndexBase, XX, ithx);
            const float2 tdx          = sm_splineParams[splineIndexX];
            const float  fxy1         = tdz.x * gridValue;
            const float  fz1          = tdz.y * gridValue;
            fx += tdx.y * tdy.x * fxy1;
            fy += tdx.x * tdy.y * fxy1;
            fz += tdx.x * tdy.x * fz1;
        }
    }
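
    /* In the loop above, each sm_splineParams element packs the spline value in .x and its
     * derivative in .y, so every partial force component pairs the derivative along its own
     * dimension with the plain spline weights along the other two (e.g. fx accumulates
     * tdx.y * tdy.x * tdz.x * gridValue).
     */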

    // Reduction of partial force contributions
    __local float sm_forces[atomsPerBlock * DIM];

    __local float  sm_forceReduction[totalSharedMemory];
    __local float *sm_forceTemp[DIM];

    reduce_atom_forces(sm_forces,
                       atomIndexLocal, splineIndex, lineIndex,
                       kernelParams.grid.realGridSizeFP,
                       fx, fy, fz,
                       sm_forceReduction,
                       sm_forceTemp);
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Calculating the final forces with no component branching, atomsPerBlock threads */
    const int forceIndexLocal  = threadLocalId;
    const int forceIndexGlobal = atomIndexOffset + forceIndexLocal;
    const int calcIndexCheck   = pme_gpu_check_atom_data_index(forceIndexGlobal, kernelParams.atoms.nAtoms);
    if ((forceIndexLocal < atomsPerBlock) & calcIndexCheck)
    {
        const float3 atomForces     = vload3(forceIndexLocal, sm_forces);
        const float  negCoefficient = -gm_coefficients[forceIndexGlobal];
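        /* The GROMACS box matrix is triangular, so only the upper-triangular components of
         * the reciprocal box should be needed to map the reduced (grid-space) forces to
         * Cartesian ones below; negCoefficient folds the charge and the force sign into one
         * factor.
         */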
        float3 result;
        result.x = negCoefficient * kernelParams.current.recipBox[XX][XX] * atomForces.x;
        result.y = negCoefficient * (kernelParams.current.recipBox[XX][YY] * atomForces.x +
                                     kernelParams.current.recipBox[YY][YY] * atomForces.y);
        result.z = negCoefficient * (kernelParams.current.recipBox[XX][ZZ] * atomForces.x +
                                     kernelParams.current.recipBox[YY][ZZ] * atomForces.y +
                                     kernelParams.current.recipBox[ZZ][ZZ] * atomForces.z);
        vstore3(result, forceIndexLocal, sm_forces);
    }

#if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_)
    /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was gmx_syncwarp() in CUDA.
     */
    barrier(CLK_LOCAL_MEM_FENCE);
#endif

    assert(atomsPerBlock <= warp_size);

    /* Writing or adding the final forces component-wise, single warp */
    const int blockForcesSize = atomsPerBlock * DIM;
    const int numIter         = (blockForcesSize + warp_size - 1) / warp_size;
    const int iterThreads     = blockForcesSize / numIter;
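    /* Worked example with illustrative values: atomsPerBlock = 8 gives blockForcesSize = 24;
     * with warp_size = 32 this yields numIter = 1 and iterThreads = 24, while a 16-wide warp
     * yields numIter = 2 and iterThreads = 12.
     */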
    if (threadLocalId < iterThreads)
    {
#pragma unroll
        for (int i = 0; i < numIter; i++)
        {
            const int outputIndexLocal  = i * iterThreads + threadLocalId;
            const int outputIndexGlobal = get_group_id(XX) * blockForcesSize + outputIndexLocal;
            const int globalOutputCheck = pme_gpu_check_atom_data_index(outputIndexGlobal, kernelParams.atoms.nAtoms * DIM);
            if (globalOutputCheck)
            {
                const float outputForceComponent = sm_forces[outputIndexLocal];
                if (overwriteForces)
                {
                    gm_forces[outputIndexGlobal] = outputForceComponent;
                }
                else
                {
                    gm_forces[outputIndexGlobal] += outputForceComponent;
                }
            }
        }
    }
}