/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/*! \internal \file
 * \brief Implements PME force gathering in CUDA.
 *
 * \author Aleksei Iupinov <a.yupinov@gmail.com>
 */

#include "gmxpre.h"

#include <cassert>

#include "pme.cuh"
#include "pme-gpu-utils.h"

/*! \brief
 * An inline CUDA function: unroll the dynamic index accesses to the constant grid sizes to avoid local memory operations.
 */
__device__ __forceinline__ float read_grid_size(const float *realGridSizeFP,
                                                const int    dimIndex)
{
    switch (dimIndex)
    {
        case XX: return realGridSizeFP[XX];
        case YY: return realGridSizeFP[YY];
        case ZZ: return realGridSizeFP[ZZ];
    }
    assert(false);
    return 0.0f;
}

/*! \brief Reduce the partial force contributions.
 *
 * \tparam[in] order              The PME order (must be 4).
 * \tparam[in] atomDataSize       The number of partial force contributions for each atom (currently order^2 == 16)
 * \tparam[in] blockSize          The CUDA block size
 * \param[out] sm_forces          Shared memory array with the output forces (number of elements is number of atoms per block)
 * \param[in]  atomIndexLocal     Local atom index
 * \param[in]  splineIndex        Spline index
 * \param[in]  lineIndex          Line index (same as threadLocalId)
 * \param[in]  realGridSizeFP     Local grid size constant
 * \param[in]  fx                 Input force partial component X
 * \param[in]  fy                 Input force partial component Y
 * \param[in]  fz                 Input force partial component Z
 */
template <
    const int order,
    const int atomDataSize,
    const int blockSize
    >
__device__ __forceinline__ void reduce_atom_forces(float3 * __restrict__ sm_forces,
                                                   const int             atomIndexLocal,
                                                   const int             splineIndex,
                                                   const int             lineIndex,
                                                   const float          *realGridSizeFP,
                                                   float                &fx,
                                                   float                &fy,
                                                   float                &fz)
{
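    // Each atom contributes atomDataSize (= order^2) partial values per force component,
    // held by consecutive threads. Two reduction strategies follow: a warp-shuffle tree
    // reduction (power-of-two orders, GMX_PTX_ARCH >= 300) and a shared-memory fallback.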
#if (GMX_PTX_ARCH >= 300)
    if (!(order & (order - 1))) // Only for orders of power of 2
    {
        const unsigned int activeMask = c_fullWarpMask;

        // A tricky shuffle reduction inspired by reduce_force_j_warp_shfl
        // TODO: find out if this is the best in terms of transactions count
        static_assert(order == 4, "Only order of 4 is implemented");
        static_assert(atomDataSize <= warp_size, "TODO: rework for atomDataSize > warp_size (order 8 or larger)");
        const int width = atomDataSize;

        fx += gmx_shfl_down_sync(activeMask, fx, 1, width);
        fy += gmx_shfl_up_sync  (activeMask, fy, 1, width);
        fz += gmx_shfl_down_sync(activeMask, fz, 1, width);

        if (splineIndex & 1)
        {
            fx = fy;
        }

        fx += gmx_shfl_down_sync(activeMask, fx, 2, width);
        fz += gmx_shfl_up_sync  (activeMask, fz, 2, width);

        if (splineIndex & 2)
        {
            fx = fz;
        }

        // By now fx contains intermediate quad sums of all 3 components:
        // splineIndex    0            1            2 and 3      4            5            6 and 7      8...
        // sum of...      fx0 to fx3   fy0 to fy3   fz0 to fz3   fx4 to fx7   fy4 to fy7   fz4 to fz7   etc.

        // We have to just further reduce those groups of 4
        for (int delta = 4; delta < atomDataSize; delta <<= 1)
        {
            fx += gmx_shfl_down_sync(activeMask, fx, delta, width);
        }
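
        // After this loop the first three lanes of each atom's group (splineIndex 0, 1, 2)
        // hold the fully reduced x, y and z sums for that atom, respectively.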
        const int dimIndex = splineIndex;
        if (dimIndex < DIM)
        {
            const float n = read_grid_size(realGridSizeFP, dimIndex);
            *((float *)(&sm_forces[atomIndexLocal]) + dimIndex) = fx * n;
        }
    }
    else
#endif
    {
        // We use blockSize shared memory elements to read fx, or fy, or fz, and then reduce them to fit into smemPerDim elements
        // which are stored separately (first 2 dimensions only)
        const int         smemPerDim   = warp_size;
        const int         smemReserved = (DIM - 1) * smemPerDim;
        __shared__ float  sm_forceReduction[smemReserved + blockSize];
        __shared__ float *sm_forceTemp[DIM];

        const int         numWarps  = blockSize / smemPerDim;
        const int         minStride = max(1, atomDataSize / numWarps); // order 4: 128 threads => 4, 256 threads => 2, etc
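
        // After the first reduction stage below, each atom keeps minStride partial sums per
        // component, so (number of atoms per block) * minStride == warp_size and each
        // dimension's packed partials fit into one warp-sized chunk of sm_forceTemp.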
#pragma unroll
        for (int dimIndex = 0; dimIndex < DIM; dimIndex++)
        {
            int elementIndex = smemReserved + lineIndex;
            // Store input force contributions
            sm_forceReduction[elementIndex] = (dimIndex == XX) ? fx : (dimIndex == YY) ? fy : fz;
            // Reduce to fit into smemPerDim (warp size)
#pragma unroll
            for (int redStride = atomDataSize / 2; redStride > minStride; redStride >>= 1)
            {
                if (splineIndex < redStride)
                {
                    sm_forceReduction[elementIndex] += sm_forceReduction[elementIndex + redStride];
                }
            }
            __syncthreads();
            // Last iteration - packing everything to be nearby, storing convenience pointer
            sm_forceTemp[dimIndex] = sm_forceReduction + dimIndex * smemPerDim;
            int redStride = minStride;
            if (splineIndex < redStride)
            {
                const int packedIndex = atomIndexLocal * redStride + splineIndex;
                sm_forceTemp[dimIndex][packedIndex] = sm_forceReduction[elementIndex] + sm_forceReduction[elementIndex + redStride];
            }
            __syncthreads();
        }

        assert((blockSize / warp_size) >= DIM);
        //assert(atomsPerBlock <= warp_size);

        const int warpIndex = lineIndex / warp_size;
        const int dimIndex  = warpIndex;

        // First 3 warps can now process 1 dimension each
        if (dimIndex < DIM)
        {
            int sourceIndex = lineIndex % warp_size;
#pragma unroll
            for (int redStride = minStride / 2; redStride > 1; redStride >>= 1)
            {
                if (!(splineIndex & redStride))
                {
                    sm_forceTemp[dimIndex][sourceIndex] += sm_forceTemp[dimIndex][sourceIndex + redStride];
                }
            }

            const float n = read_grid_size(realGridSizeFP, dimIndex);

            const int   atomIndex = sourceIndex / minStride;
            if (sourceIndex == minStride * atomIndex)
            {
                *((float *)(&sm_forces[atomIndex]) + dimIndex) = (sm_forceTemp[dimIndex][sourceIndex] + sm_forceTemp[dimIndex][sourceIndex + 1]) * n;
            }
        }
    }
}

/*! \brief
 * A CUDA kernel which gathers the atom forces from the grid.
 * The grid is assumed to be wrapped in dimension Z.
 *
 * \tparam[in] order                The PME order (must be 4 currently).
 * \tparam[in] overwriteForces      True: the forces are written to the output buffer;
 *                                  False: the forces are added non-atomically to the output buffer (e.g. to the bonded forces).
 * \tparam[in] wrapX                Tells if the grid is wrapped in the X dimension.
 * \tparam[in] wrapY                Tells if the grid is wrapped in the Y dimension.
 * \param[in]  kernelParams         All the PME GPU data.
 */
template <
    const int  order,
    const bool overwriteForces,
    const bool wrapX,
    const bool wrapY
    >
__launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP)
__global__ void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams)
{
    /* Global memory pointers */
    const float * __restrict__ gm_coefficients    = kernelParams.atoms.d_coefficients;
    const float * __restrict__ gm_grid            = kernelParams.grid.d_realGrid;
    const float * __restrict__ gm_theta           = kernelParams.atoms.d_theta;
    const float * __restrict__ gm_dtheta          = kernelParams.atoms.d_dtheta;
    const int * __restrict__   gm_gridlineIndices = kernelParams.atoms.d_gridlineIndices;
    float * __restrict__       gm_forces          = kernelParams.atoms.d_forces;

    const int atomsPerBlock = (c_gatherMaxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM);
    const int atomDataSize  = PME_SPREADGATHER_THREADS_PER_ATOM; /* Number of data components and threads for a single atom */
    const int blockSize     = atomsPerBlock * atomDataSize;
    const int atomsPerWarp  = PME_SPREADGATHER_ATOMS_PER_WARP;
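
    /* The block is laid out as (order, order, atomsPerBlock) threads: each atom is handled
     * by order^2 threads, one per (Y, Z) spline point pair (see ithy/ithz below). */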

    const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;

    /* These are the atom indices - for the shared and global memory */
    const int atomIndexLocal  = threadIdx.z;
    const int atomIndexOffset = blockIndex * atomsPerBlock;
    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;

    /* Early return for fully empty blocks at the end
     * (should only happen on Fermi or billions of input atoms)
     */
    if (atomIndexOffset >= kernelParams.atoms.nAtoms)
    {
        return;
    }

    const int         splineParamsSize    = atomsPerBlock * DIM * order;
    const int         gridlineIndicesSize = atomsPerBlock * DIM;
    __shared__ int    sm_gridlineIndices[gridlineIndicesSize];
    __shared__ float2 sm_splineParams[splineParamsSize]; /* Theta/dtheta pairs as .x/.y */

    /* Spline Y/Z coordinates */
    const int ithy = threadIdx.y;
    const int ithz = threadIdx.x;

    /* These are the spline contribution indices in shared memory */
    const int splineIndex = threadIdx.y * blockDim.x + threadIdx.x;                  /* Relative to the current particle, 0..15 for order 4 */
    const int lineIndex   = (threadIdx.z * (blockDim.x * blockDim.y)) + splineIndex; /* And to all the block's particles */

    int threadLocalId = (threadIdx.z * (blockDim.x * blockDim.y))
        + (threadIdx.y * blockDim.x)
        + threadIdx.x;

    /* Staging the atom gridline indices, DIM * atomsPerBlock threads */
    const int localGridlineIndicesIndex  = threadLocalId;
    const int globalGridlineIndicesIndex = blockIndex * gridlineIndicesSize + localGridlineIndicesIndex;
    const int globalCheckIndices         = pme_gpu_check_atom_data_index(globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
    if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
    {
        sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
        assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
    }
    /* Staging the spline parameters, DIM * order * atomsPerBlock threads */
    const int localSplineParamsIndex  = threadLocalId;
    const int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex;
    const int globalCheckSplineParams = pme_gpu_check_atom_data_index(globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
    if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
    {
        sm_splineParams[localSplineParamsIndex].x = gm_theta[globalSplineParamsIndex];
        sm_splineParams[localSplineParamsIndex].y = gm_dtheta[globalSplineParamsIndex];
        assert(isfinite(sm_splineParams[localSplineParamsIndex].x));
        assert(isfinite(sm_splineParams[localSplineParamsIndex].y));
    }
    __syncthreads();
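
    /* The staged gridline indices and spline parameters are written by different threads
     * than the ones that read them below, hence the block-wide barrier above. */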

    float fx = 0.0f;
    float fy = 0.0f;
    float fz = 0.0f;

    const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
    const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]);

    if (chargeCheck & globalCheck)
    {
        const int nx  = kernelParams.grid.realGridSize[XX];
        const int ny  = kernelParams.grid.realGridSize[YY];
        const int nz  = kernelParams.grid.realGridSize[ZZ];
        const int pny = kernelParams.grid.realGridSizePadded[YY];
        const int pnz = kernelParams.grid.realGridSizePadded[ZZ];

        const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
        const int warpIndex     = atomIndexLocal / atomsPerWarp;

        const int    splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
        const int    splineIndexY    = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
        const float2 tdy             = sm_splineParams[splineIndexY];
        const int    splineIndexZ    = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
        const float2 tdz             = sm_splineParams[splineIndexZ];
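
        /* Each thread has fixed ithy/ithz and loops over ithx below, so the Y and Z spline
         * parameters are fetched only once here, outside of the X loop. */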

        const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];
        int       iy     = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
        if (wrapY & (iy >= ny))
        {
            iy -= ny;
        }
        int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
        if (iz >= nz)
        {
            iz -= nz;
        }
        const int constOffset = iy * pnz + iz;
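
        /* Gather over the order spline points along X. For each grid value, the partial force
         * along a dimension is (dtheta along that dimension) * (theta along the other two) * gridValue;
         * the spline parameters are stored as (theta, dtheta) in (.x, .y). */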
#pragma unroll
        for (int ithx = 0; (ithx < order); ithx++)
        {
            int ix = ixBase + ithx;
            if (wrapX & (ix >= nx))
            {
                ix -= nx;
            }
            const int gridIndexGlobal = ix * pny * pnz + constOffset;
            assert(gridIndexGlobal >= 0);
            const float gridValue = gm_grid[gridIndexGlobal];
            assert(isfinite(gridValue));
            const int    splineIndexX = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
            const float2 tdx          = sm_splineParams[splineIndexX];
            const float  fxy1         = tdz.x * gridValue;
            const float  fz1          = tdz.y * gridValue;
            fx += tdx.y * tdy.x * fxy1;
            fy += tdx.x * tdy.y * fxy1;
            fz += tdx.x * tdy.x * fz1;
        }
    }

    // Reduction of partial force contributions
    __shared__ float3 sm_forces[atomsPerBlock];
    reduce_atom_forces<order, atomDataSize, blockSize>(sm_forces,
                                                       atomIndexLocal, splineIndex, lineIndex,
                                                       kernelParams.grid.realGridSizeFP,
                                                       fx, fy, fz);
    __syncthreads();

    /* Calculating the final forces with no component branching, atomsPerBlock threads */
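    /* The reduced forces are still in grid (fractional) coordinates; below they are converted
     * to Cartesian forces using the non-zero (triangular) part of the reciprocal box and
     * scaled by the negated coefficient (charge). */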
    const int forceIndexLocal  = threadLocalId;
    const int forceIndexGlobal = atomIndexOffset + forceIndexLocal;
    const int calcIndexCheck   = pme_gpu_check_atom_data_index(forceIndexGlobal, kernelParams.atoms.nAtoms);
    if ((forceIndexLocal < atomsPerBlock) & calcIndexCheck)
    {
        const float3 atomForces     = sm_forces[forceIndexLocal];
        const float  negCoefficient = -gm_coefficients[forceIndexGlobal];
        float3       result;
        result.x                   = negCoefficient * kernelParams.current.recipBox[XX][XX] * atomForces.x;
        result.y                   = negCoefficient * (kernelParams.current.recipBox[XX][YY] * atomForces.x + kernelParams.current.recipBox[YY][YY] * atomForces.y);
        result.z                   = negCoefficient * (kernelParams.current.recipBox[XX][ZZ] * atomForces.x + kernelParams.current.recipBox[YY][ZZ] * atomForces.y + kernelParams.current.recipBox[ZZ][ZZ] * atomForces.z);
        sm_forces[forceIndexLocal] = result;
    }

    assert(atomsPerBlock <= warp_size);

    /* Writing or adding the final forces component-wise, single warp */
    const int blockForcesSize = atomsPerBlock * DIM;
    const int numIter         = (blockForcesSize + warp_size - 1) / warp_size;
    const int iterThreads     = blockForcesSize / numIter;
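
    /* A single warp writes out all blockForcesSize force components, in numIter passes of
     * iterThreads threads each. */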
    if (threadLocalId < iterThreads)
    {
#pragma unroll
        for (int i = 0; i < numIter; i++)
        {
            int       outputIndexLocal  = i * iterThreads + threadLocalId;
            int       outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal;
            const int globalOutputCheck = pme_gpu_check_atom_data_index(outputIndexGlobal, kernelParams.atoms.nAtoms * DIM);
            if (globalOutputCheck)
            {
                const float outputForceComponent = ((float *)sm_forces)[outputIndexLocal];
                if (overwriteForces)
                {
                    gm_forces[outputIndexGlobal] = outputForceComponent;
                }
                else
                {
                    gm_forces[outputIndexGlobal] += outputForceComponent;
                }
            }
        }
    }
}

//! Kernel instantiations
template __global__ void pme_gather_kernel<4, true, true, true>(const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, false, true, true>(const PmeGpuCudaKernelParams);