/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief Implements PME force gathering in CUDA.
 *
 * \author Aleksei Iupinov <a.yupinov@gmail.com>
 */

#include "gmxpre.h"

#include <cassert>

#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/gmxassert.h"

#include "pme.cuh"
#include "pme-timings.cuh"
//! Gathering max block width in warps - picked empirically among 2, 4, 8, 16 for max. occupancy and min. runtime
constexpr int c_gatherMaxWarpsPerBlock = 4;
//! Gathering max block size in threads
constexpr int c_gatherMaxThreadsPerBlock = c_gatherMaxWarpsPerBlock * warp_size;
//! Gathering min blocks per CUDA multiprocessor - for CC2.x, we just take the CUDA limit of 8 to avoid the warning
constexpr int c_gatherMinBlocksPerMP = (GMX_PTX_ARCH < 300) ? GMX_CUDA_MAX_BLOCKS_PER_MP : (GMX_CUDA_MAX_THREADS_PER_MP / c_gatherMaxThreadsPerBlock);
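// For example, with warp_size == 32 this gives c_gatherMaxThreadsPerBlock == 128; on a device
// allowing 2048 resident threads per multiprocessor that would request at least 16 blocks per MP.
// (Illustrative numbers only - the actual limits come from the GMX_CUDA_* macros for the target arch.)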
/*! \brief
 * An inline CUDA function: unroll the dynamic index accesses to the constant grid sizes to avoid local memory operations.
 */
__device__ __forceinline__ float read_grid_size(const float *realGridSizeFP,
                                                const int    dimIndex)
{
    switch (dimIndex)
    {
        case XX: return realGridSizeFP[XX];
        case YY: return realGridSizeFP[YY];
        case ZZ: return realGridSizeFP[ZZ];
    }
    assert(false);
    return 0.0f;
}
/*! \brief Reduce the partial force contributions.
 *
 * \tparam[in] order              The PME order (must be 4).
 * \tparam[in] atomDataSize       The number of partial force contributions for each atom (currently order^2 == 16)
 * \tparam[in] blockSize          The CUDA block size
 * \param[out] sm_forces          Shared memory array with the output forces (number of elements is number of atoms per block)
 * \param[in]  atomIndexLocal     Local atom index
 * \param[in]  splineIndex        Spline index
 * \param[in]  lineIndex          Line index (same as threadLocalId)
 * \param[in]  realGridSizeFP     Local grid size constant
 * \param[in]  fx                 Input force partial component X
 * \param[in]  fy                 Input force partial component Y
 * \param[in]  fz                 Input force partial component Z
 */
template <
    const int order,
    const int atomDataSize,
    const int blockSize
    >
__device__ __forceinline__ void reduce_atom_forces(float3 * __restrict__ sm_forces,
                                                    const int             atomIndexLocal,
                                                    const int             splineIndex,
                                                    const int             lineIndex,
                                                    const float          *realGridSizeFP,
                                                    float                &fx,
                                                    float                &fy,
                                                    float                &fz)
{
#if (GMX_PTX_ARCH >= 300)
    if (!(order & (order - 1))) // Only for orders of power of 2
    {
        const unsigned int activeMask = c_fullWarpMask;

        // A tricky shuffle reduction inspired by reduce_force_j_warp_shfl
        // TODO: find out if this is the best in terms of transactions count
        static_assert(order == 4, "Only order of 4 is implemented");
        static_assert(atomDataSize <= warp_size, "TODO: rework for atomDataSize > warp_size (order 8 or larger)");
        const int width = atomDataSize;
        fx += gmx_shfl_down_sync(activeMask, fx, 1, width);
        fy += gmx_shfl_up_sync  (activeMask, fy, 1, width);
        fz += gmx_shfl_down_sync(activeMask, fz, 1, width);

        if (splineIndex & 1)
        {
            fx = fy;
        }

        fx += gmx_shfl_down_sync(activeMask, fx, 2, width);
        fz += gmx_shfl_up_sync  (activeMask, fz, 2, width);

        if (splineIndex & 2)
        {
            fx = fz;
        }
        // By now fx contains intermediate quad sums of all 3 components:
        // splineIndex    0            1            2 and 3      4            5            6 and 7      8...
        // sum of...      fx0 to fx3   fy0 to fy3   fz0 to fz3   fx4 to fx7   fy4 to fy7   fz4 to fz7   etc.

        // We have to just further reduce those groups of 4
        for (int delta = 4; delta < atomDataSize; delta <<= 1)
        {
            fx += gmx_shfl_down_sync(activeMask, fx, delta, width);
        }
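        // After the loop above, the first DIM lanes of each atom's shuffle group hold the fully
        // reduced x, y and z sums (for order 4: lanes 0, 1 and 2 of every 16-lane group).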
        const int dimIndex = splineIndex;
        if (dimIndex < DIM)
        {
            const float n = read_grid_size(realGridSizeFP, dimIndex);
            *((float *)(&sm_forces[atomIndexLocal]) + dimIndex) = fx * n;
        }
    }
    else
#endif
    {
        // We use blockSize shared memory elements to read fx, or fy, or fz, and then reduce them to fit into smemPerDim elements
        // which are stored separately (first 2 dimensions only)
        const int         smemPerDim   = warp_size;
        const int         smemReserved = (DIM - 1) * smemPerDim;
        __shared__ float  sm_forceReduction[smemReserved + blockSize];
        __shared__ float *sm_forceTemp[DIM];

        const int         numWarps  = blockSize / smemPerDim;
        const int         minStride = max(1, atomDataSize / numWarps); // order 4: 128 threads => 4, 256 threads => 2, etc
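        // As an illustration, for the order-4 launch with 128-thread blocks used in this file:
        // numWarps = 128 / 32 = 4, so minStride = 16 / 4 = 4 partial sums per atom remain after
        // the first reduction stage below.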
#pragma unroll
        for (int dimIndex = 0; dimIndex < DIM; dimIndex++)
        {
            int elementIndex = smemReserved + lineIndex;
            // Store input force contributions
            sm_forceReduction[elementIndex] = (dimIndex == XX) ? fx : (dimIndex == YY) ? fy : fz;
            // Reduce to fit into smemPerDim (warp size)
#pragma unroll
            for (int redStride = atomDataSize / 2; redStride > minStride; redStride >>= 1)
            {
                if (splineIndex < redStride)
                {
                    sm_forceReduction[elementIndex] += sm_forceReduction[elementIndex + redStride];
                }
            }
            __syncthreads();
            // Last iteration - packing everything to be nearby, storing convenience pointer
            sm_forceTemp[dimIndex] = sm_forceReduction + dimIndex * smemPerDim;
            int redStride = minStride;
            if (splineIndex < redStride)
            {
                const int packedIndex = atomIndexLocal * redStride + splineIndex;
                sm_forceTemp[dimIndex][packedIndex] = sm_forceReduction[elementIndex] + sm_forceReduction[elementIndex + redStride];
            }
            __syncthreads();
        }
        assert ((blockSize / warp_size) >= DIM);
        //assert (atomsPerBlock <= warp_size);

        const int warpIndex = lineIndex / warp_size;
        const int dimIndex  = warpIndex;

        // First 3 warps can now process 1 dimension each
        if (dimIndex < DIM)
        {
            int sourceIndex = lineIndex % warp_size;
#pragma unroll
            for (int redStride = minStride / 2; redStride > 1; redStride >>= 1)
            {
                if (!(splineIndex & redStride))
                {
                    sm_forceTemp[dimIndex][sourceIndex] += sm_forceTemp[dimIndex][sourceIndex + redStride];
                }
            }

            const float n = read_grid_size(realGridSizeFP, dimIndex);

            const int   atomIndex = sourceIndex / minStride;
            if (sourceIndex == minStride * atomIndex)
            {
                *((float *)(&sm_forces[atomIndex]) + dimIndex) = (sm_forceTemp[dimIndex][sourceIndex] + sm_forceTemp[dimIndex][sourceIndex + 1]) * n;
            }
        }
    }
}
/*! \brief
 * A CUDA kernel which gathers the atom forces from the grid.
 * The grid is assumed to be wrapped in dimension Z.
 *
 * \tparam[in] order                The PME order (must be 4 currently).
 * \tparam[in] overwriteForces      True: the forces are written to the output buffer;
 *                                  False: the forces are added non-atomically to the output buffer (e.g. to the bonded forces).
 * \tparam[in] wrapX                Tells if the grid is wrapped in the X dimension.
 * \tparam[in] wrapY                Tells if the grid is wrapped in the Y dimension.
 * \param[in]  kernelParams         All the PME GPU data.
 */
template <
    const int order,
    const bool overwriteForces,
    const bool wrapX,
    const bool wrapY
    >
__launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP)
__global__ void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams)
{
    /* Global memory pointers */
    const float * __restrict__ gm_coefficients    = kernelParams.atoms.d_coefficients;
    const float * __restrict__ gm_grid            = kernelParams.grid.d_realGrid;
    const float * __restrict__ gm_theta           = kernelParams.atoms.d_theta;
    const float * __restrict__ gm_dtheta          = kernelParams.atoms.d_dtheta;
    const int   * __restrict__ gm_gridlineIndices = kernelParams.atoms.d_gridlineIndices;
    float       * __restrict__ gm_forces          = kernelParams.atoms.d_forces;
    /* Some sizes */
    const int atomsPerBlock = (c_gatherMaxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM);
    const int atomDataSize  = PME_SPREADGATHER_THREADS_PER_ATOM; /* Number of data components and threads for a single atom */
    const int blockSize     = atomsPerBlock * atomDataSize;
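    // With the current constants (128 max threads per block, 16 threads per atom for order 4),
    // this gives 8 atoms per block and a 128-thread block.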
    const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
    /* These are the atom indices - for the shared and global memory */
    const int atomIndexLocal  = threadIdx.z;
    const int atomIndexOffset = blockIndex * atomsPerBlock;
    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
    /* Early return for fully empty blocks at the end
     * (should only happen on Fermi or billions of input atoms)
     */
    if (atomIndexOffset >= kernelParams.atoms.nAtoms)
    {
        return;
    }
    const int         splineParamsSize    = atomsPerBlock * DIM * order;
    const int         gridlineIndicesSize = atomsPerBlock * DIM;
    __shared__ int    sm_gridlineIndices[gridlineIndicesSize];
    __shared__ float2 sm_splineParams[splineParamsSize]; /* Theta/dtheta pairs as .x/.y */
    /* Spline Y/Z coordinates */
    const int ithy = threadIdx.y;
    const int ithz = threadIdx.x;
    /* These are the spline contribution indices in shared memory */
    const int splineIndex = threadIdx.y * blockDim.x + threadIdx.x;                  /* Relative to the current particle, 0..15 for order 4 */
    const int lineIndex   = (threadIdx.z * (blockDim.x * blockDim.y)) + splineIndex; /* And to all the block's particles */

    int threadLocalId = (threadIdx.z * (blockDim.x * blockDim.y))
        + (threadIdx.y * blockDim.x)
        + threadIdx.x;
    /* Staging the atom gridline indices, DIM * atomsPerBlock threads */
    const int localGridlineIndicesIndex  = threadLocalId;
    const int globalGridlineIndicesIndex = blockIndex * gridlineIndicesSize + localGridlineIndicesIndex;
    const int globalCheckIndices         = pme_gpu_check_atom_data_index(globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
    if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
    {
        sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
        assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
    }
    /* Staging the spline parameters, DIM * order * atomsPerBlock threads */
    const int localSplineParamsIndex  = threadLocalId;
    const int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex;
    const int globalCheckSplineParams = pme_gpu_check_atom_data_index(globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
    if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
    {
        sm_splineParams[localSplineParamsIndex].x = gm_theta[globalSplineParamsIndex];
        sm_splineParams[localSplineParamsIndex].y = gm_dtheta[globalSplineParamsIndex];
        assert(isfinite(sm_splineParams[localSplineParamsIndex].x));
        assert(isfinite(sm_splineParams[localSplineParamsIndex].y));
    }
    __syncthreads();

    float fx = 0.0f;
    float fy = 0.0f;
    float fz = 0.0f;
    const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
    const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]);

    if (chargeCheck & globalCheck)
    {
        const int nx  = kernelParams.grid.realGridSize[XX];
        const int ny  = kernelParams.grid.realGridSize[YY];
        const int nz  = kernelParams.grid.realGridSize[ZZ];
        const int pny = kernelParams.grid.realGridSizePadded[YY];
        const int pnz = kernelParams.grid.realGridSizePadded[ZZ];
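        // realGridSizePadded is the allocation pitch of the grid, so pny * pnz and pnz are the
        // strides used below for stepping along X and Y when computing flat grid indices.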
        const int    atomWarpIndex = atomIndexLocal % PME_SPREADGATHER_ATOMS_PER_WARP;
        const int    warpIndex     = atomIndexLocal / PME_SPREADGATHER_ATOMS_PER_WARP;

        const int    splineIndexBase = getSplineParamIndexBase<order>(warpIndex, atomWarpIndex);
        const int    splineIndexY    = getSplineParamIndex<order>(splineIndexBase, YY, ithy);
        const float2 tdy             = sm_splineParams[splineIndexY];
        const int    splineIndexZ    = getSplineParamIndex<order>(splineIndexBase, ZZ, ithz);
        const float2 tdz             = sm_splineParams[splineIndexZ];
        const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];
        int       iy     = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
        if (wrapY & (iy >= ny))
        {
            iy -= ny;
        }
        int       iz     = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
        if (iz >= nz)
        {
            iz -= nz;
        }
        const int constOffset = iy * pnz + iz;
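        // The loop below walks the 'order' spline contributions along X, wrapping the grid index
        // when required, and accumulates this thread's (ithy, ithz) share of the three force
        // components from the real-space grid.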
#pragma unroll
        for (int ithx = 0; (ithx < order); ithx++)
        {
            int ix = ixBase + ithx;
            if (wrapX & (ix >= nx))
            {
                ix -= nx;
            }
            const int    gridIndexGlobal = ix * pny * pnz + constOffset;
            assert(gridIndexGlobal >= 0);
            const float  gridValue       = gm_grid[gridIndexGlobal];
            assert(isfinite(gridValue));
            const int    splineIndexX    = getSplineParamIndex<order>(splineIndexBase, XX, ithx);
            const float2 tdx             = sm_splineParams[splineIndexX];
            const float  fxy1            = tdz.x * gridValue;
            const float  fz1             = tdz.y * gridValue;
            fx += tdx.y * tdy.x * fxy1;
            fy += tdx.x * tdy.y * fxy1;
            fz += tdx.x * tdy.x * fz1;
        }
    }
    // Reduction of partial force contributions
    __shared__ float3 sm_forces[atomsPerBlock];
    reduce_atom_forces<order, atomDataSize, blockSize>(sm_forces,
                                                       atomIndexLocal, splineIndex, lineIndex,
                                                       kernelParams.grid.realGridSizeFP,
                                                       fx, fy, fz);
    __syncthreads();
    /* Calculating the final forces with no component branching, atomsPerBlock threads */
    const int forceIndexLocal  = threadLocalId;
    const int forceIndexGlobal = atomIndexOffset + forceIndexLocal;
    const int calcIndexCheck   = pme_gpu_check_atom_data_index(forceIndexGlobal, kernelParams.atoms.nAtoms);
    if ((forceIndexLocal < atomsPerBlock) & calcIndexCheck)
    {
        const float3 atomForces     = sm_forces[forceIndexLocal];
        const float  negCoefficient = -gm_coefficients[forceIndexGlobal];
        float3       result;
        result.x = negCoefficient * kernelParams.current.recipBox[XX][XX] * atomForces.x;
        result.y = negCoefficient * (kernelParams.current.recipBox[XX][YY] * atomForces.x + kernelParams.current.recipBox[YY][YY] * atomForces.y);
        result.z = negCoefficient * (kernelParams.current.recipBox[XX][ZZ] * atomForces.x + kernelParams.current.recipBox[YY][ZZ] * atomForces.y + kernelParams.current.recipBox[ZZ][ZZ] * atomForces.z);
        sm_forces[forceIndexLocal] = result;
    }
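    // The transform above converts the accumulated gradients from fractional (grid) coordinates to
    // Cartesian forces using the upper triangle of the reciprocal box matrix; the negated coefficient
    // provides the usual force = -q * gradient sign and charge scaling.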
    assert(atomsPerBlock <= warp_size);

    /* Writing or adding the final forces component-wise, single warp */
    const int blockForcesSize = atomsPerBlock * DIM;
    const int numIter         = (blockForcesSize + warp_size - 1) / warp_size;
    const int iterThreads     = blockForcesSize / numIter;
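    // Example with the current constants: 8 atoms per block gives blockForcesSize = 24 floats,
    // so numIter = 1 and iterThreads = 24 - a single warp writes out all of the block's forces.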
    if (threadLocalId < iterThreads)
    {
#pragma unroll
        for (int i = 0; i < numIter; i++)
        {
            int       outputIndexLocal  = i * iterThreads + threadLocalId;
            int       outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal;
            const int globalOutputCheck = pme_gpu_check_atom_data_index(outputIndexGlobal, kernelParams.atoms.nAtoms * DIM);
            if (globalOutputCheck)
            {
                const float outputForceComponent = ((float *)sm_forces)[outputIndexLocal];
                if (overwriteForces)
                {
                    gm_forces[outputIndexGlobal] = outputForceComponent;
                }
                else
                {
                    gm_forces[outputIndexGlobal] += outputForceComponent;
                }
            }
        }
    }
}
void pme_gpu_gather(PmeGpu                *pmeGpu,
                    PmeForceOutputHandling forceTreatment,
                    const float           *h_grid
                    )
{
    /* Copying the input CPU forces for reduction */
    if (forceTreatment != PmeForceOutputHandling::Set)
    {
        pme_gpu_copy_input_forces(pmeGpu);
    }
    const int   order           = pmeGpu->common->pme_order;
    const auto *kernelParamsPtr = pmeGpu->kernelParams.get();

    if (!pme_gpu_performs_FFT(pmeGpu) || pme_gpu_is_testing(pmeGpu))
    {
        pme_gpu_copy_input_gather_grid(pmeGpu, const_cast<float *>(h_grid));
    }

    if (pme_gpu_is_testing(pmeGpu))
    {
        pme_gpu_copy_input_gather_atom_data(pmeGpu);
    }
    const int atomsPerBlock = (c_gatherMaxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM);
    GMX_ASSERT(!c_usePadding || !(PME_ATOM_DATA_ALIGNMENT % atomsPerBlock), "inconsistent atom data padding vs. gathering block size");

    const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
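    // nAtomsPadded is rounded up to a multiple of PME_ATOM_DATA_ALIGNMENT, which the assert above
    // requires to be a multiple of atomsPerBlock, so this division is exact.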
    auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);

    KernelLaunchConfig config;
    config.blockSize[0] = config.blockSize[1] = order;
    config.blockSize[2] = atomsPerBlock;
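    // For order 4 this requests a (4, 4, atomsPerBlock) block: threadIdx.x/.y index the Z/Y spline
    // contributions and threadIdx.z indexes the atom, matching the kernel's layout above.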
    config.gridSize[0]  = dimGrid.x;
    config.gridSize[1]  = dimGrid.y;
    config.stream       = pmeGpu->archSpecific->pmeStream;
    if (order != 4)
    {
        GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 was not implemented!"));
    }

    constexpr bool wrapX = true;
    constexpr bool wrapY = true;
    GMX_UNUSED_VALUE(wrapX);
    GMX_UNUSED_VALUE(wrapY);
    // TODO test different cache configs

    int timingId = gtPME_GATHER;
    void (*kernelPtr)(const PmeGpuCudaKernelParams) = (forceTreatment == PmeForceOutputHandling::Set) ?
        pme_gather_kernel<4, true, wrapX, wrapY> :
        pme_gather_kernel<4, false, wrapX, wrapY>;
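    // The overwriteForces template argument is fixed at compile time here: 'true' overwrites the
    // output force buffer, 'false' accumulates into it (e.g. on top of previously copied CPU forces).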
    pme_gpu_start_timing(pmeGpu, timingId);
    auto      *timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
    const auto kernelArgs  = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
    launchGpuKernel(kernelPtr, config, timingEvent, "PME gather", kernelArgs);
    pme_gpu_stop_timing(pmeGpu, timingId);

    pme_gpu_copy_output_forces(pmeGpu);
}