/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief Implements PME force gathering in CUDA.
 *
 * \author Aleksei Iupinov <a.yupinov@gmail.com>
 */
#include <cassert>

#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"

#include "pme_calculate_splines.cuh"
#include "pme_gpu_utils.h"
/*! \brief
 * An inline CUDA function: unroll the dynamic index accesses to the constant grid sizes to avoid local memory operations.
 */
__device__ __forceinline__ float read_grid_size(const float* realGridSizeFP, const int dimIndex)
{
    switch (dimIndex)
    {
        case XX: return realGridSizeFP[XX];
        case YY: return realGridSizeFP[YY];
        case ZZ: return realGridSizeFP[ZZ];
        default: break;
    }
    assert(false);
    return 0.0f;
}
/*! \brief Reduce the partial force contributions.
 *
 * \tparam[in] order          The PME order (must be 4).
 * \tparam[in] atomDataSize   The number of partial force contributions for each atom (currently order^2 == 16).
 * \tparam[in] blockSize      The CUDA block size.
 *
 * \param[out] sm_forces      Shared memory array with the output forces (number of elements is number of atoms per block).
 * \param[in]  atomIndexLocal Local atom index.
 * \param[in]  splineIndex    Spline index.
 * \param[in]  lineIndex      Line index (same as threadLocalId).
 * \param[in]  realGridSizeFP Local grid size constant.
 * \param[in]  fx             Input force partial component X.
 * \param[in]  fy             Input force partial component Y.
 * \param[in]  fz             Input force partial component Z.
 */
template<const int order, const int atomDataSize, const int blockSize>
__device__ __forceinline__ void reduce_atom_forces(float3* __restrict__ sm_forces,
                                                    const int    atomIndexLocal,
                                                    const int    splineIndex,
                                                    const int    lineIndex,
                                                    const float* realGridSizeFP,
                                                    float&       fx,
                                                    float&       fy,
                                                    float&       fz)
{
    if (!(order & (order - 1))) // Only for orders of power of 2
    {
        const unsigned int activeMask = c_fullWarpMask;

        // A tricky shuffle reduction inspired by reduce_force_j_warp_shfl
        // TODO: find out if this is the best in terms of transactions count
        static_assert(order == 4, "Only order of 4 is implemented");
        static_assert(atomDataSize <= warp_size,
                      "TODO: rework for atomDataSize > warp_size (order 8 or larger)");
        const int width = atomDataSize;
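        /* The interleaved shuffles below combine the x, y and z partial sums: after the lane swaps,
           within each group of four lanes, lane 0 holds the x sum, lane 1 the y sum and lane 2 the
           z sum of the group (lane 3 is unused). */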
        fx += __shfl_down_sync(activeMask, fx, 1, width);
        fy += __shfl_up_sync(activeMask, fy, 1, width);
        fz += __shfl_down_sync(activeMask, fz, 1, width);
        if (splineIndex & 1)
        {
            fx = fy;
        }
        fx += __shfl_down_sync(activeMask, fx, 2, width);
        fz += __shfl_up_sync(activeMask, fz, 2, width);
        if (splineIndex & 2)
        {
            fx = fz;
        }

        // By now fx contains intermediate quad sums of all 3 components:
        // splineIndex    0            1            2 and 3      4            5            6 and 7      8...
        // sum of...      fx0 to fx3   fy0 to fy3   fz0 to fz3   fx4 to fx7   fy4 to fy7   fz4 to fz7   etc.
        // We just have to further reduce those groups of 4
        for (int delta = 4; delta < atomDataSize; delta <<= 1)
        {
            fx += __shfl_down_sync(activeMask, fx, delta, width);
        }

        const int dimIndex = splineIndex;
        if (dimIndex < DIM)
        {
            const float n = read_grid_size(realGridSizeFP, dimIndex);
            *((float*)(&sm_forces[atomIndexLocal]) + dimIndex) = fx * n;
        }
    }
    else
    {
        // We use blockSize shared memory elements to read fx, or fy, or fz, and then reduce them to
        // fit into smemPerDim elements which are stored separately (first 2 dimensions only)
        const int         smemPerDim   = warp_size;
        const int         smemReserved = (DIM)*smemPerDim;
        __shared__ float  sm_forceReduction[smemReserved + blockSize];
        __shared__ float* sm_forceTemp[DIM];

        const int numWarps = blockSize / smemPerDim;
        const int minStride =
                max(1, atomDataSize / numWarps); // order 4: 128 threads => 4, 256 threads => 2, etc
#pragma unroll
        for (int dimIndex = 0; dimIndex < DIM; dimIndex++)
        {
            int elementIndex = smemReserved + lineIndex;
            // Store input force contributions
            sm_forceReduction[elementIndex] = (dimIndex == XX) ? fx : (dimIndex == YY) ? fy : fz;
            // sync here because two warps write data that the first one consumes below
            __syncthreads();
            // Reduce to fit into smemPerDim (warp size)
#pragma unroll
            for (int redStride = atomDataSize / 2; redStride > minStride; redStride >>= 1)
            {
                if (splineIndex < redStride)
                {
                    sm_forceReduction[elementIndex] += sm_forceReduction[elementIndex + redStride];
                }
            }
            // Last iteration - packing everything to be nearby, storing convenience pointer
            sm_forceTemp[dimIndex] = sm_forceReduction + dimIndex * smemPerDim;
            int redStride          = minStride;
            if (splineIndex < redStride)
            {
                const int packedIndex = atomIndexLocal * redStride + splineIndex;
                sm_forceTemp[dimIndex][packedIndex] =
                        sm_forceReduction[elementIndex] + sm_forceReduction[elementIndex + redStride];
            }
            __syncthreads();
        }
        assert((blockSize / warp_size) >= DIM);
        // assert (atomsPerBlock <= warp_size);

        const int warpIndex = lineIndex / warp_size;
        const int dimIndex  = warpIndex;

        // First 3 warps can now process 1 dimension each
        if (dimIndex < DIM)
        {
            int sourceIndex = lineIndex % warp_size;
#pragma unroll
            for (int redStride = minStride / 2; redStride > 1; redStride >>= 1)
            {
                if (!(splineIndex & redStride))
                {
                    sm_forceTemp[dimIndex][sourceIndex] += sm_forceTemp[dimIndex][sourceIndex + redStride];
                }
            }

            __syncwarp();

            const float n         = read_grid_size(realGridSizeFP, dimIndex);
            const int   atomIndex = sourceIndex / minStride;

            if (sourceIndex == minStride * atomIndex)
            {
                *((float*)(&sm_forces[atomIndex]) + dimIndex) =
                        (sm_forceTemp[dimIndex][sourceIndex] + sm_forceTemp[dimIndex][sourceIndex + 1]) * n;
            }
        }
    }
}
/*! \brief
 * A CUDA kernel which gathers the atom forces from the grid.
 * The grid is assumed to be wrapped in dimension Z.
 *
 * \tparam[in] order           The PME order (must be 4 currently).
 * \tparam[in] overwriteForces True: the forces are written to the output buffer;
 *                             False: the forces are added non-atomically to the output buffer (e.g. to the bonded forces).
 * \tparam[in] wrapX           Tells if the grid is wrapped in the X dimension.
 * \tparam[in] wrapY           Tells if the grid is wrapped in the Y dimension.
 * \tparam[in] readGlobal      Tells if we should read spline values from global memory.
 * \tparam[in] useOrderThreads Tells if we should use order threads per atom (order*order used if false).
 * \param[in]  kernelParams    All the PME GPU data.
 */
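/* Note on the two thread-per-atom layouts: with useOrderThreads == true, "order" threads handle one
 * atom and each of them loops over all "order" Y spline indices below; with useOrderThreads == false,
 * order*order threads handle one atom and each thread processes a single (Y, Z) spline index pair. */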
template<const int order, const bool overwriteForces, const bool wrapX, const bool wrapY, const bool readGlobal, const bool useOrderThreads>
__launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__
        void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams)
{
    /* Global memory pointers */
    const float* __restrict__ gm_coefficients = kernelParams.atoms.d_coefficients;
    const float* __restrict__ gm_grid         = kernelParams.grid.d_realGrid;
    float* __restrict__ gm_forces             = kernelParams.atoms.d_forces;

    /* Global memory pointers for readGlobal */
    const float* __restrict__ gm_theta         = kernelParams.atoms.d_theta;
    const float* __restrict__ gm_dtheta        = kernelParams.atoms.d_dtheta;
    const int* __restrict__ gm_gridlineIndices = kernelParams.atoms.d_gridlineIndices;

    const int atomsPerBlock =
            useOrderThreads ? (c_gatherMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom)
                            : (c_gatherMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom);
    const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;

    /* Number of data components and threads for a single atom */
    const int atomDataSize = useOrderThreads ? c_pmeSpreadGatherThreadsPerAtom4ThPerAtom
                                             : c_pmeSpreadGatherThreadsPerAtom;
    const int atomsPerWarp = useOrderThreads ? c_pmeSpreadGatherAtomsPerWarp4ThPerAtom
                                             : c_pmeSpreadGatherAtomsPerWarp;

    const int blockSize = atomsPerBlock * atomDataSize;
    assert(blockSize == blockDim.x * blockDim.y * blockDim.z);

    /* These are the atom indices - for the shared and global memory */
    const int atomIndexLocal  = threadIdx.z;
    const int atomIndexOffset = blockIndex * atomsPerBlock;
    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
    /* Early return for fully empty blocks at the end
     * (should only happen for billions of input atoms)
     */
    if (atomIndexOffset >= kernelParams.atoms.nAtoms)
    {
        return;
    }
    // 4 warps per block, 8 atoms per warp *3 *4
    const int splineParamsSize    = atomsPerBlock * DIM * order;
    const int gridlineIndicesSize = atomsPerBlock * DIM;
    __shared__ int   sm_gridlineIndices[gridlineIndicesSize];
    __shared__ float sm_theta[splineParamsSize];
    __shared__ float sm_dtheta[splineParamsSize];
    /* Spline Z coordinates */
    const int ithz = threadIdx.x;

    /* These are the spline contribution indices in shared memory */
    const int splineIndex = threadIdx.y * blockDim.x + threadIdx.x;
    const int lineIndex   = (threadIdx.z * (blockDim.x * blockDim.y))
                          + splineIndex; /* And to all the block's particles */

    const int threadLocalId =
            (threadIdx.z * (blockDim.x * blockDim.y)) + blockDim.x * threadIdx.y + threadIdx.x;
    const int threadLocalIdMax = blockDim.x * blockDim.y * blockDim.z;
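    /* Thread layout (set by the host-side launch configuration, assumed here): threadIdx.x spans the
       spline Z index, threadIdx.y the spline Y index (or stays 0 when only order threads per atom are
       used), and threadIdx.z indexes the atom within the block. */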
    if (readGlobal)
    {
        /* Read the spline data from global memory */
        const int localGridlineIndicesIndex  = threadLocalId;
        const int globalGridlineIndicesIndex = blockIndex * gridlineIndicesSize + localGridlineIndicesIndex;
        const int globalCheckIndices         = pme_gpu_check_atom_data_index(
                globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
        if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
        {
            sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
            assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
        }
        /* With order threads per atom, this loop is needed to make sure all data values are loaded,
           as each thread has to load multiple values; with order*order threads per atom, each thread
           only needs to load a single value. */
        const int iMin = 0;
        const int iMax = useOrderThreads ? 3 : 1;

        for (int i = iMin; i < iMax; i++)
        {
            int localSplineParamsIndex =
                    threadLocalId
                    + i * threadLocalIdMax; /* i will always be zero for order*order threads per atom */
            int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex;
            int globalCheckSplineParams = pme_gpu_check_atom_data_index(
                    globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
            if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
            {
                sm_theta[localSplineParamsIndex]  = gm_theta[globalSplineParamsIndex];
                sm_dtheta[localSplineParamsIndex] = gm_dtheta[globalSplineParamsIndex];
                assert(isfinite(sm_theta[localSplineParamsIndex]));
                assert(isfinite(sm_dtheta[localSplineParamsIndex]));
            }
        }
        __syncthreads();
    }
    else
    {
        /* Recalculate splines */
        float3 atomX;
        float  atomCharge;
        if (c_useAtomDataPrefetch)
        {
            __shared__ float sm_coefficients[atomsPerBlock];
            __shared__ float sm_coordinates[DIM * atomsPerBlock];
            /* Staging coefficients/charges */
            pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients,
                                                             kernelParams.atoms.d_coefficients);

            /* Staging coordinates */
            pme_gpu_stage_atom_data<float, atomsPerBlock, DIM>(kernelParams, sm_coordinates,
                                                               kernelParams.atoms.d_coordinates);
            __syncthreads();
            atomX.x    = sm_coordinates[atomIndexLocal * DIM + XX];
            atomX.y    = sm_coordinates[atomIndexLocal * DIM + YY];
            atomX.z    = sm_coordinates[atomIndexLocal * DIM + ZZ];
            atomCharge = sm_coefficients[atomIndexLocal];
        }
        else
        {
            atomCharge = gm_coefficients[atomIndexGlobal];
            atomX.x    = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + XX];
            atomX.y    = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + YY];
            atomX.z    = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + ZZ];
        }
        calculate_splines<order, atomsPerBlock, atomsPerWarp, true, false>(
                kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, sm_dtheta, sm_gridlineIndices);
        __syncwarp();
    }
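    /* At this point sm_theta, sm_dtheta and sm_gridlineIndices are populated for all atoms of the
       block, whether they were read from global memory or recomputed above. */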
    float fx = 0.0f;
    float fy = 0.0f;
    float fz = 0.0f;

    const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
    const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]);

    if (chargeCheck & globalCheck)
    {
        const int nx  = kernelParams.grid.realGridSize[XX];
        const int ny  = kernelParams.grid.realGridSize[YY];
        const int nz  = kernelParams.grid.realGridSize[ZZ];
        const int pny = kernelParams.grid.realGridSizePadded[YY];
        const int pnz = kernelParams.grid.realGridSizePadded[ZZ];
        const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
        const int warpIndex     = atomIndexLocal / atomsPerWarp;

        const int splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
        const int splineIndexZ    = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
        const float2 tdz          = make_float2(sm_theta[splineIndexZ], sm_dtheta[splineIndexZ]);

        int       iz     = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
        const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];

        /* The grid is wrapped in dimension Z */
        if (iz >= nz)
        {
            iz -= nz;
        }
        int constOffset, iy;
        const int ithyMin = useOrderThreads ? 0 : threadIdx.y;
        const int ithyMax = useOrderThreads ? order : threadIdx.y + 1;
        for (int ithy = ithyMin; ithy < ithyMax; ithy++)
        {
            const int    splineIndexY = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
            const float2 tdy          = make_float2(sm_theta[splineIndexY], sm_dtheta[splineIndexY]);

            iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
            if (wrapY & (iy >= ny))
            {
                iy -= ny;
            }
            constOffset = iy * pnz + iz;
#pragma unroll
            for (int ithx = 0; (ithx < order); ithx++)
            {
                int ix = ixBase + ithx;
                if (wrapX & (ix >= nx))
                {
                    ix -= nx;
                }
                const int gridIndexGlobal = ix * pny * pnz + constOffset;
                assert(gridIndexGlobal >= 0);
                const float gridValue = gm_grid[gridIndexGlobal];
                assert(isfinite(gridValue));
                const int splineIndexX =
                        getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
                const float2 tdx = make_float2(sm_theta[splineIndexX], sm_dtheta[splineIndexX]);
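                /* Each force component combines the spline derivative (dtheta, the .y component of
                   td*) along its own dimension with the spline values (theta, .x) along the other
                   two, all weighted by the grid value. */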
                const float fxy1 = tdz.x * gridValue;
                const float fz1  = tdz.y * gridValue;
                fx += tdx.y * tdy.x * fxy1;
                fy += tdx.x * tdy.y * fxy1;
                fz += tdx.x * tdy.x * fz1;
            }
        }
    }
    // Reduction of partial force contributions
    __shared__ float3 sm_forces[atomsPerBlock];
    reduce_atom_forces<order, atomDataSize, blockSize>(sm_forces, atomIndexLocal, splineIndex, lineIndex,
                                                       kernelParams.grid.realGridSizeFP, fx, fy, fz);
    __syncthreads();
    /* Calculating the final forces with no component branching, atomsPerBlock threads */
    const int forceIndexLocal  = threadLocalId;
    const int forceIndexGlobal = atomIndexOffset + forceIndexLocal;
    const int calcIndexCheck = pme_gpu_check_atom_data_index(forceIndexGlobal, kernelParams.atoms.nAtoms);
    if ((forceIndexLocal < atomsPerBlock) & calcIndexCheck)
    {
        const float3 atomForces     = sm_forces[forceIndexLocal];
        const float  negCoefficient = -gm_coefficients[forceIndexGlobal];
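        /* Transform the reduced force from grid (fractional) coordinates to Cartesian coordinates
           using the reciprocal box vectors, scaled by minus the charge of the atom. */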
        float3 result;
        result.x = negCoefficient * kernelParams.current.recipBox[XX][XX] * atomForces.x;
        result.y = negCoefficient
                   * (kernelParams.current.recipBox[XX][YY] * atomForces.x
                      + kernelParams.current.recipBox[YY][YY] * atomForces.y);
        result.z = negCoefficient
                   * (kernelParams.current.recipBox[XX][ZZ] * atomForces.x
                      + kernelParams.current.recipBox[YY][ZZ] * atomForces.y
                      + kernelParams.current.recipBox[ZZ][ZZ] * atomForces.z);
        sm_forces[forceIndexLocal] = result;
    }
    __syncwarp();
    assert(atomsPerBlock <= warp_size);

    /* Writing or adding the final forces component-wise, single warp */
    const int blockForcesSize = atomsPerBlock * DIM;
    const int numIter         = (blockForcesSize + warp_size - 1) / warp_size;
    const int iterThreads     = blockForcesSize / numIter;
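    /* For example, with atomsPerBlock == 8 there are 24 force components per block, so numIter == 1
       and the first 24 threads of the warp each write a single component. */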
    if (threadLocalId < iterThreads)
    {
#pragma unroll
        for (int i = 0; i < numIter; i++)
        {
            int       outputIndexLocal  = i * iterThreads + threadLocalId;
            int       outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal;
            const int globalOutputCheck =
                    pme_gpu_check_atom_data_index(outputIndexGlobal, kernelParams.atoms.nAtoms * DIM);
            if (globalOutputCheck)
            {
                const float outputForceComponent = ((float*)sm_forces)[outputIndexLocal];
                if (overwriteForces)
                {
                    gm_forces[outputIndexGlobal] = outputForceComponent;
                }
                else
                {
                    gm_forces[outputIndexGlobal] += outputForceComponent;
                }
            }
        }
    }
}
//! Kernel instantiations
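//! Template arguments are: order, overwriteForces, wrapX, wrapY, readGlobal, useOrderThreads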
template __global__ void pme_gather_kernel<4, true, true, true, true, true>(const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, true, true, false>(const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, false, true, true, true, true>(const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, false, true, true, true, false>(const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, true, false, true>(const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, true, false, false>(const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, false, true, true, false, true>(const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, false, true, true, false, false>(const PmeGpuCudaKernelParams);