2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Implements helper routines for PME gather and spline routines.
39 * \author Aleksei Iupinov <a.yupinov@gmail.com>
46 #include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
49 #include "pme_gpu_utils.h"
52 //! Controls if the atom and charge data is prefeched into shared memory or loaded per thread from global
53 static const bool c_useAtomDataPrefetch = true;
56 * General purpose function for loading atom-related data from global to shared memory.
58 * \tparam[in] T Data type (float/int/...)
59 * \tparam[in] atomsPerBlock Number of atoms processed by a block - should be accounted for in the size of the shared memory array.
60 * \tparam[in] dataCountPerAtom Number of data elements per single atom (e.g. DIM for an rvec coordinates array).
61 * \param[in] kernelParams Input PME CUDA data in constant memory.
62 * \param[out] sm_destination Shared memory array for output.
63 * \param[in] gm_source Global memory array for input.
65 template<typename T, const int atomsPerBlock, const int dataCountPerAtom>
66 __device__ __forceinline__ void pme_gpu_stage_atom_data(const PmeGpuCudaKernelParams kernelParams,
67 T* __restrict__ sm_destination,
68 const T* __restrict__ gm_source)
70 static_assert(c_usePadding,
71 "With padding disabled, index checking should be fixed to account for spline "
72 "theta/dtheta pr-warp alignment");
73 const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
74 const int threadLocalIndex = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x) + threadIdx.x;
75 const int localIndex = threadLocalIndex;
76 const int globalIndexBase = blockIndex * atomsPerBlock * dataCountPerAtom;
77 const int globalIndex = globalIndexBase + localIndex;
78 const int globalCheck =
79 pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
80 if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
82 assert(isfinite(float(gm_source[globalIndex])));
83 sm_destination[localIndex] = gm_source[globalIndex];
88 * PME GPU spline parameter and gridline indices calculation.
89 * This corresponds to the CPU functions calc_interpolation_idx() and make_bsplines().
90 * First stage of the whole kernel.
92 * \tparam[in] order PME interpolation order.
93 * \tparam[in] atomsPerBlock Number of atoms processed by a block - should be accounted for
94 * in the sizes of the shared memory arrays.
95 * \tparam[in] atomsPerWarp Number of atoms processed by a warp
96 * \tparam[in] writeSmDtheta Bool controling if the theta derivative should be written to shared memory. Enables calculation of dtheta if set.
97 * \tparam[in] writeGlobal A boolean which tells if the theta values and gridlines should be written to global memory. Enables calculation of dtheta if set.
98 * \param[in] kernelParams Input PME CUDA data in constant memory.
99 * \param[in] atomIndexOffset Starting atom index for the execution block w.r.t. global memory.
100 * \param[in] atomX Atom coordinate of atom processed by thread.
101 * \param[in] atomCharge Atom charge/coefficient of atom processed by thread.
102 * \param[out] sm_theta Atom spline values in the shared memory.
103 * \param[out] sm_dtheta Derivative of atom spline values in shared memory.
104 * \param[out] sm_gridlineIndices Atom gridline indices in the shared memory.
107 template<const int order, const int atomsPerBlock, const int atomsPerWarp, const bool writeSmDtheta, const bool writeGlobal>
108 __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams kernelParams,
109 const int atomIndexOffset,
111 const float atomCharge,
112 float* __restrict__ sm_theta,
113 float* __restrict__ sm_dtheta,
114 int* __restrict__ sm_gridlineIndices)
116 /* Global memory pointers for output */
117 float* __restrict__ gm_theta = kernelParams.atoms.d_theta;
118 float* __restrict__ gm_dtheta = kernelParams.atoms.d_dtheta;
119 int* __restrict__ gm_gridlineIndices = kernelParams.atoms.d_gridlineIndices;
121 /* Fractional coordinates */
122 __shared__ float sm_fractCoords[atomsPerBlock * DIM];
124 /* Thread index w.r.t. block */
125 const int threadLocalId =
126 (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
127 /* Warp index w.r.t. block - could probably be obtained easier? */
128 const int warpIndex = threadLocalId / warp_size;
129 /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
130 const int atomWarpIndex = threadIdx.z % atomsPerWarp;
131 /* Atom index w.r.t. block/shared memory */
132 const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
134 /* Atom index w.r.t. global memory */
135 const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
136 /* Spline contribution index in one dimension */
137 const int threadLocalIdXY = (threadIdx.y * blockDim.x) + threadIdx.x;
138 const int orderIndex = threadLocalIdXY / DIM;
139 /* Dimension index */
140 const int dimIndex = threadLocalIdXY % DIM;
142 /* Multi-purpose index of rvec/ivec atom data */
143 const int sharedMemoryIndex = atomIndexLocal * DIM + dimIndex;
145 float splineData[order];
147 const int localCheck = (dimIndex < DIM) && (orderIndex < 1);
148 const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
150 /* we have 4 threads per atom, but can only use 3 here for the dimensions */
151 if (localCheck && globalCheck)
153 /* Indices interpolation */
157 int tableIndex, tInt;
159 assert(atomIndexLocal < DIM * atomsPerBlock);
160 /* Accessing fields in fshOffset/nXYZ/recipbox/... with dimIndex offset
161 * puts them into local memory(!) instead of accessing the constant memory directly.
162 * That's the reason for the switch, to unroll explicitly.
163 * The commented parts correspond to the 0 components of the recipbox.
168 tableIndex = kernelParams.grid.tablesOffsets[XX];
169 n = kernelParams.grid.realGridSizeFP[XX];
170 t = atomX.x * kernelParams.current.recipBox[dimIndex][XX]
171 + atomX.y * kernelParams.current.recipBox[dimIndex][YY]
172 + atomX.z * kernelParams.current.recipBox[dimIndex][ZZ];
176 tableIndex = kernelParams.grid.tablesOffsets[YY];
177 n = kernelParams.grid.realGridSizeFP[YY];
178 t = /*atomX.x * kernelParams.current.recipBox[dimIndex][XX] + */ atomX.y
179 * kernelParams.current.recipBox[dimIndex][YY]
180 + atomX.z * kernelParams.current.recipBox[dimIndex][ZZ];
184 tableIndex = kernelParams.grid.tablesOffsets[ZZ];
185 n = kernelParams.grid.realGridSizeFP[ZZ];
186 t = /*atomX.x * kernelParams.current.recipBox[dimIndex][XX] + atomX.y * kernelParams.current.recipBox[dimIndex][YY] + */ atomX
188 * kernelParams.current.recipBox[dimIndex][ZZ];
191 const float shift = c_pmeMaxUnitcellShift;
192 /* Fractional coordinates along box vectors, adding a positive shift to ensure t is positive for triclinic boxes */
195 assert(sharedMemoryIndex < atomsPerBlock * DIM);
196 sm_fractCoords[sharedMemoryIndex] = t - tInt;
199 assert(tInt < c_pmeNeighborUnitcellCount * n);
201 // TODO have shared table for both parameters to share the fetch, as index is always same?
202 // TODO compare texture/LDG performance
203 sm_fractCoords[sharedMemoryIndex] +=
204 fetchFromParamLookupTable(kernelParams.grid.d_fractShiftsTable,
205 kernelParams.fractShiftsTableTexture, tableIndex);
206 sm_gridlineIndices[sharedMemoryIndex] =
207 fetchFromParamLookupTable(kernelParams.grid.d_gridlineIndicesTable,
208 kernelParams.gridlineIndicesTableTexture, tableIndex);
211 gm_gridlineIndices[atomIndexOffset * DIM + sharedMemoryIndex] =
212 sm_gridlineIndices[sharedMemoryIndex];
216 /* B-spline calculation */
218 const int chargeCheck = pme_gpu_check_atom_charge(atomCharge);
222 int o = orderIndex; // This is an index that is set once for PME_GPU_PARALLEL_SPLINE == 1
224 const float dr = sm_fractCoords[sharedMemoryIndex];
225 assert(isfinite(dr));
227 /* dr is relative offset from lower cell limit */
228 splineData[order - 1] = 0.0f;
230 splineData[0] = 1.0f - dr;
233 for (int k = 3; k < order; k++)
235 div = 1.0f / (k - 1.0f);
236 splineData[k - 1] = div * dr * splineData[k - 2];
238 for (int l = 1; l < (k - 1); l++)
240 splineData[k - l - 1] =
241 div * ((dr + l) * splineData[k - l - 2] + (k - l - dr) * splineData[k - l - 1]);
243 splineData[0] = div * (1.0f - dr) * splineData[0];
246 const int thetaIndexBase =
247 getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
248 const int thetaGlobalOffsetBase = atomIndexOffset * DIM * order;
249 /* only calculate dtheta if we are saving it to shared or global memory */
250 if (writeSmDtheta || writeGlobal)
252 /* Differentiation and storing the spline derivatives (dtheta) */
254 for (o = 0; o < order; o++)
256 const int thetaIndex =
257 getSplineParamIndex<order, atomsPerWarp>(thetaIndexBase, dimIndex, o);
259 const float dtheta = ((o > 0) ? splineData[o - 1] : 0.0f) - splineData[o];
260 assert(isfinite(dtheta));
261 assert(thetaIndex < order * DIM * atomsPerBlock);
264 sm_dtheta[thetaIndex] = dtheta;
268 const int thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
269 gm_dtheta[thetaGlobalIndex] = dtheta;
274 div = 1.0f / (order - 1.0f);
275 splineData[order - 1] = div * dr * splineData[order - 2];
277 for (int k = 1; k < (order - 1); k++)
279 splineData[order - k - 1] = div
280 * ((dr + k) * splineData[order - k - 2]
281 + (order - k - dr) * splineData[order - k - 1]);
283 splineData[0] = div * (1.0f - dr) * splineData[0];
285 /* Storing the spline values (theta) */
287 for (o = 0; o < order; o++)
289 const int thetaIndex =
290 getSplineParamIndex<order, atomsPerWarp>(thetaIndexBase, dimIndex, o);
291 assert(thetaIndex < order * DIM * atomsPerBlock);
292 sm_theta[thetaIndex] = splineData[o];
293 assert(isfinite(sm_theta[thetaIndex]));
296 const int thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
297 gm_theta[thetaGlobalIndex] = splineData[o];