2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2016,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 /*! \libinternal \file
37 * \brief Defines the GPU-agnostic PME GPU data structures
38 * (the host-side PME GPU data, and the GPU function parameters).
39 * \todo Due to Gerrit workflow and time constraints, some renaming/refactoring
40 * which does not impair the performance will be performed once
41 * most of the initial PME CUDA implementation is merged
42 * into the master branch (likely, after release 2017).
43 * This should include:
44 * -- bringing the structure/function names up to guidelines
45 * ---- pme_gpu_settings_t -> PmeGpuTasks
46 * -- refining GPU notation application (#2053)
47 * -- renaming coefficients to charges (?)
49 * \author Aleksei Iupinov <a.yupinov@gmail.com>
50 * \ingroup module_ewald
53 #ifndef GMX_EWALD_PME_GPU_TYPES_H
54 #define GMX_EWALD_PME_GPU_TYPES_H
61 #include "gromacs/math/vectypes.h"
62 #include "gromacs/utility/basedefinitions.h"
65 struct gmx_device_info_t;
67 /*! \brief Possible PME codepaths
68 * \todo Make this an enum class as part of the gmx_pme_t C++ refactoring
72 CPU, //!< Whole PME step is done on CPU
73 GPU, //!< Whole PME step is done on GPU
74 Hybrid, //!< Mixed mode: only spread and gather run on GPU; FFT and solving are done on CPU.
77 #if GMX_GPU == GMX_GPU_CUDA
79 struct pme_gpu_cuda_t;
80 /*! \brief A typedef for including the GPU host data by pointer */
81 typedef pme_gpu_cuda_t pme_gpu_specific_t;
83 struct pme_gpu_cuda_kernel_params_t;
84 /*! \brief A typedef for including the GPU kernel arguments data by pointer */
85 typedef pme_gpu_cuda_kernel_params_t pme_gpu_kernel_params_t;
89 /*! \brief A dummy typedef for the GPU host data placeholder on non-GPU builds */
90 typedef int pme_gpu_specific_t;
91 /*! \brief A dummy typedef for the GPU kernel arguments data placeholder on non-GPU builds */
92 typedef int pme_gpu_kernel_params_t;
96 /* What follows is all the PME GPU function arguments,
97 * sorted into several device-side structures depending on the update rate.
98 * This is GPU agnostic (float3 replaced by float[3], etc.).
99 * The GPU-framework specifics (e.g. cudaTextureObject_t handles) are described
100 * in the larger structure pme_gpu_cuda_kernel_params_t in pme.cuh.
104 * A GPU data structure for storing the constant PME data.
105 * This only has to be initialized once.
107 struct pme_gpu_const_params_t
109 /*! \brief Electrostatics coefficient = ONE_4PI_EPS0 / pme->epsilon_r */
111 /*! \brief Virial and energy GPU array. Size is PME_GPU_ENERGY_AND_VIRIAL_COUNT (7) floats.
112 * The element order is virxx, viryy, virzz, virxy, virxz, viryz, energy. */
113 float *d_virialAndEnergy;
117 * A GPU data structure for storing the PME data related to the grid sizes and cut-off.
118 * This only has to be updated at every DD step.
120 struct pme_gpu_grid_params_t
123 /*! \brief Real-space grid data dimensions. */
124 int realGridSize[DIM];
125 /*! \brief Real-space grid dimensions, only converted to floating point. */
126 float realGridSizeFP[DIM];
127 /*! \brief Real-space grid dimensions (padded). The padding as compared to realGridSize includes the (order - 1) overlap. */
128 int realGridSizePadded[DIM]; /* Is major dimension of this ever used in kernels? */
129 /*! \brief Fourier grid dimensions. This counts the complex numbers! */
130 int complexGridSize[DIM];
131 /*! \brief Fourier grid dimensions (padded). This counts the complex numbers! */
132 int complexGridSizePadded[DIM];
135 /*! \brief Real space grid. */
137 /*! \brief Complex grid - used in FFT/solve. If inplace cuFFT is used, then it is the same pointer as realGrid. */
138 float *d_fourierGrid;
140 /*! \brief Ewald solving factor = (M_PI / pme->ewaldcoeff_q)^2 */
143 /*! \brief Grid spline values as in pme->bsp_mod
144 * (laid out sequentially (XXX....XYYY......YZZZ.....Z))
146 float *d_splineModuli;
147 /*! \brief Offsets for X/Y/Z components of d_splineModuli */
148 int splineValuesOffset[DIM];
150 /*! \brief Fractional shifts lookup table as in pme->fshx/fshy/fshz, laid out sequentially (XXX....XYYY......YZZZ.....Z) */
151 float *d_fractShiftsTable;
152 /*! \brief Gridline indices lookup table
153 * (modulo lookup table as in pme->nnx/nny/nnz, laid out sequentially (XXX....XYYY......YZZZ.....Z)) */
154 int *d_gridlineIndicesTable;
155 /*! \brief Offsets for X/Y/Z components of d_fractShiftsTable and d_gridlineIndicesTable */
156 int tablesOffsets[DIM];
160 * A GPU data structure for storing the PME data of the atoms, local to this process' domain partition.
161 * This only has to be updated every DD step.
163 struct pme_gpu_atom_params_t
165 /*! \brief Number of local atoms */
167 /*! \brief Pointer to the global GPU memory with input rvec atom coordinates.
168 * The coordinates themselves change and need to be copied to the GPU every MD step,
169 * but reallocation happens only at DD.
171 float *d_coordinates;
172 /*! \brief Pointer to the global GPU memory with input atom charges.
173 * The charges only need to be reallocated and copied to the GPU at DD step.
175 float *d_coefficients;
176 /*! \brief Pointer to the global GPU memory with input/output rvec atom forces.
177 * The forces change and need to be copied from (and possibly to) the GPU every MD step,
178 * but reallocation happens only at DD.
181 /*! \brief Pointer to the global GPU memory with ivec atom gridline indices.
182 * Computed on GPU in the spline calculation part.
184 int *d_gridlineIndices;
186 /* B-spline parameters are computed entirely on GPU every MD step, not copied.
187 * Unless we want to try something like GPU spread + CPU gather?
189 /*! \brief Pointer to the global GPU memory with B-spline values */
191 /*! \brief Pointer to the global GPU memory with B-spline derivative values */
196 * A GPU data structure for storing the PME data which might change every MD step.
198 struct pme_gpu_step_params_t
200 /* The box parameters. The box only changes size each step with pressure coupling enabled. */
202 * Reciprocal (inverted unit cell) box.
204 * The box is transposed as compared to the CPU pme->recipbox.
205 * Basically, spread uses matrix columns (while solve and gather use rows).
206 * This storage format might not be optimal, since the box is always triangular and therefore contains zeroes.
208 float recipBox[DIM][DIM];
209 /*! \brief The unit cell volume for solving. */
214 * A single structure encompassing almost all the PME data used in GPU kernels on device.
215 * This is inherited by the GPU framework-specific structure
216 * (pme_gpu_cuda_kernel_params_t in pme.cuh).
217 * This way, most code preparing the kernel parameters can be GPU-agnostic by casting
218 * the kernel parameter data pointer to pme_gpu_kernel_params_base_t.
220 struct pme_gpu_kernel_params_base_t
222 /*! \brief Constant data that is set once. */
223 pme_gpu_const_params_t constants;
224 /*! \brief Data dependent on the grid size/cutoff. */
225 pme_gpu_grid_params_t grid;
226 /*! \brief Data dependent on the DD and local atoms. */
227 pme_gpu_atom_params_t atoms;
228 /*! \brief Data that possibly changes on every MD step. */
229 pme_gpu_step_params_t step;
232 /* Here are the host-side structures */
235 * The PME GPU settings structure, included in the main PME GPU structure by value.
237 struct pme_gpu_settings_t
239 /* Permanent settings set on initialization */
240 /*! \brief A boolean which tells if the solving is performed on GPU. Currently always true */
241 bool performGPUSolve;
242 /*! \brief A boolean which tells if the gathering is performed on GPU. Currently always true */
243 bool performGPUGather;
244 /*! \brief A boolean which tells if the FFT is performed on GPU. Currently true for a single MPI rank. */
246 /*! \brief A convenience boolean which tells if PME decomposition is used. */
247 bool useDecomposition;
248 /*! \brief A boolean which tells if any PME GPU stage should copy all of its outputs to the host.
249 * Only intended to be used by the test framework.
252 /*! \brief Various computation flags for the current step, corresponding to the GMX_PME_ flags in pme.h. */
257 * The PME GPU intermediate buffers structure, included in the main PME GPU structure by value.
258 * Buffers are managed by the PME GPU module.
260 struct pme_gpu_staging_t
262 /*! \brief Virial and energy intermediate host-side buffer. Size is PME_GPU_VIRIAL_AND_ENERGY_COUNT. */
263 float *h_virialAndEnergy;
264 /*! \brief B-spline values intermediate host-side buffer. */
265 float *h_splineModuli;
267 /*! \brief Pointer to the host memory with B-spline values. Only used for host-side gather, or unit tests */
269 /*! \brief Pointer to the host memory with B-spline derivative values. Only used for host-side gather, or unit tests */
271 /*! \brief Pointer to the host memory with ivec atom gridline indices. Only used for host-side gather, or unit tests */
272 int *h_gridlineIndices;
276 * The PME GPU structure for all the data copied directly from the CPU PME structure.
277 * The copying is done when the CPU PME structure is already (re-)initialized
278 * (pme_gpu_reinit is called at the end of gmx_pme_init).
279 * All the variables here are named almost the same way as in gmx_pme_t.
280 * The types are different: pointers are replaced by vectors.
281 * TODO: use the shared data with the PME CPU.
282 * Included in the main PME GPU structure by value.
286 /*! \brief Grid count - currently always 1 on GPU */
288 /*! \brief Grid dimensions - nkx, nky, nkz */
290 /*! \brief Padded grid dimensions - pmegrid_nx, pmegrid_ny, pmegrid_nz
291 * TODO: find out if these are really needed for the CPU FFT compatibility.
294 /*! \brief PME interpolation order */
296 /*! \brief Ewald splitting coefficient for Coulomb */
298 /*! \brief Electrostatics parameter */
300 /*! \brief Gridline indices - nnx, nny, nnz */
302 /*! \brief Fractional shifts - fshx, fshy, fshz */
303 std::vector<real> fsh;
304 /*! \brief Precomputed B-spline values */
305 std::vector<real> bsp_mod[DIM];
306 /*! \brief The PME codepath being taken */
311 * The main PME GPU host structure, included in the PME CPU structure by pointer.
315 /*! \brief The information copied once per reinit from the CPU structure. */
316 std::shared_ptr<pme_shared_t> common; // TODO: make the CPU structure use the same type
318 /*! \brief The settings. */
319 pme_gpu_settings_t settings;
321 /*! \brief The host-side buffers.
322 * The device-side buffers are buried in kernelParams, but that will have to change.
324 pme_gpu_staging_t staging;
326 /*! \brief Number of local atoms, padded to be divisible by PME_ATOM_DATA_ALIGNMENT.
327 * Used for kernel scheduling.
328 * kernelParams.atoms.nAtoms is the actual atom count to be used for data copying.
329 * TODO: this and the next member represent memory allocation/padding properties -
330 * something a container type should ideally handle.
333 /*! \brief Number of local atoms, padded to be divisible by PME_ATOM_DATA_ALIGNMENT
334 * if c_usePadding is true.
335 * Used only as a basic size for almost all the atom data allocations
336 * (spline parameter data is also aligned by PME_SPREADGATHER_PARTICLES_PER_WARP).
337 * This should be the same as (c_usePadding ? nAtomsPadded : kernelParams.atoms.nAtoms).
338 * kernelParams.atoms.nAtoms is the actual atom count to be used for most data copying.
342 /*! \brief A pointer to the device used during the execution. */
343 gmx_device_info_t *deviceInfo;
345 /*! \brief A single structure encompassing all the PME data used on GPU.
346 * Its value is the only argument to all the PME GPU kernels.
347 * \todo Test whether this should be copied to the constant GPU memory once per MD step
348 * (or even less often with no box updates) instead of being an argument.
350 std::shared_ptr<pme_gpu_kernel_params_t> kernelParams;
352 /*! \brief The pointer to GPU-framework specific host-side data, such as CUDA streams and events. */
353 std::shared_ptr<pme_gpu_specific_t> archSpecific; /* FIXME: make it a unique_ptr */