2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2016,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
38 * \brief This file contains internal function definitions for performing the PME calculations on GPU.
39 * These are not meant to be exposed outside of the PME GPU code.
40 * As of now, their bodies are still in the common pme-gpu.cpp files.
42 * \author Aleksei Iupinov <a.yupinov@gmail.com>
43 * \ingroup module_ewald
46 #ifndef GMX_EWALD_PME_GPU_INTERNAL_H
47 #define GMX_EWALD_PME_GPU_INTERNAL_H
49 #include "gromacs/fft/fft.h" // for the gmx_fft_direction enum
50 #include "gromacs/gpu_utils/gpu_macros.h" // for the CUDA_FUNC_ macros
51 #include "gromacs/utility/arrayref.h"
53 #include "pme-gpu-types.h" // for the inline functions accessing PmeGpu members
57 struct gmx_pme_t; // only used in pme_gpu_reinit
58 struct gmx_wallclock_gpu_pme_t;
59 struct pme_atomcomm_t;
67 //! Type of spline data
68 enum class PmeSplineDataType
71 Derivatives, // dtheta
72 }; //TODO move this into new and shiny pme.h (pme-types.h?)
74 //! PME grid dimension ordering (from major to minor)
75 enum class GridOrdering
81 /* Some general constants for PME GPU behaviour follow. */
83 /*! \brief \libinternal
84 * false: The atom data GPU buffers are sized precisely according to the number of atoms.
85 * (Except GPU spline data layout which is regardless intertwined for 2 atoms per warp).
86 * The atom index checks in the spread/gather code potentially hinder the performance.
87 * true: The atom data GPU buffers are padded with zeroes so that the possible number of atoms
88 * fitting in is divisible by PME_ATOM_DATA_ALIGNMENT.
89 * The atom index checks are not performed. There should be a performance win, but how big is it, remains to be seen.
90 * Additional cudaMemsetAsync calls are done occasionally (only charges/coordinates; spline data is always recalculated now).
91 * \todo Estimate performance differences
93 const bool c_usePadding = true;
95 /*! \brief \libinternal
96 * false: Atoms with zero charges are processed by PME. Could introduce some overhead.
97 * true: Atoms with zero charges are not processed by PME. Adds branching to the spread/gather.
98 * Could be good for performance in specific systems with lots of neutral atoms.
99 * \todo Estimate performance differences.
101 const bool c_skipNeutralAtoms = false;
103 /*! \brief \libinternal
104 * Number of PME solve output floating point numbers.
105 * 6 for symmetric virial matrix + 1 for reciprocal energy.
107 const int c_virialAndEnergyCount = 7;
109 /* A block of CUDA-only functions that live in pme.cu */
111 /*! \libinternal \brief
112 * Returns the number of atoms per chunk in the atom charges/coordinates data layout.
113 * Depends on CUDA-specific block sizes, needed for the atom data padding.
115 * \param[in] pmeGpu The PME GPU structure.
116 * \returns Number of atoms in a single GPU atom data chunk.
118 CUDA_FUNC_QUALIFIER int pme_gpu_get_atom_data_alignment(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM_WITH_RETURN(1)
120 /*! \libinternal \brief
121 * Returns the number of atoms per chunk in the atom spline theta/dtheta data layout.
123 * \param[in] pmeGpu The PME GPU structure.
124 * \returns Number of atoms in a single GPU atom spline data chunk.
126 CUDA_FUNC_QUALIFIER int pme_gpu_get_atoms_per_warp(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM_WITH_RETURN(1)
128 /*! \libinternal \brief
129 * Synchronizes the current computation, waiting for the GPU kernels/transfers to finish.
131 * \param[in] pmeGpu The PME GPU structure.
133 CUDA_FUNC_QUALIFIER void pme_gpu_synchronize(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
135 /*! \libinternal \brief
136 * Allocates the fixed size energy and virial buffer both on GPU and CPU.
138 * \param[in] pmeGpu The PME GPU structure.
140 CUDA_FUNC_QUALIFIER void pme_gpu_alloc_energy_virial(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
142 /*! \libinternal \brief
143 * Frees the energy and virial memory both on GPU and CPU.
145 * \param[in] pmeGpu The PME GPU structure.
147 CUDA_FUNC_QUALIFIER void pme_gpu_free_energy_virial(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
149 /*! \libinternal \brief
150 * Clears the energy and virial memory on GPU with 0.
151 * Should be called at the end of PME computation which returned energy/virial.
153 * \param[in] pmeGpu The PME GPU structure.
155 CUDA_FUNC_QUALIFIER void pme_gpu_clear_energy_virial(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
157 /*! \libinternal \brief
158 * Reallocates and copies the pre-computed B-spline values to the GPU.
160 * \param[in] pmeGpu The PME GPU structure.
162 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_bspline_values(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
164 /*! \libinternal \brief
165 * Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers).
167 * \param[in] pmeGpu The PME GPU structure.
169 CUDA_FUNC_QUALIFIER void pme_gpu_free_bspline_values(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
171 /*! \libinternal \brief
172 * Reallocates the GPU buffer for the PME forces.
174 * \param[in] pmeGpu The PME GPU structure.
176 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_forces(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
178 /*! \libinternal \brief
179 * Frees the GPU buffer for the PME forces.
181 * \param[in] pmeGpu The PME GPU structure.
183 CUDA_FUNC_QUALIFIER void pme_gpu_free_forces(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
185 /*! \libinternal \brief
186 * Copies the forces from the CPU buffer to the GPU (to reduce them with the PME GPU gathered forces).
187 * To be called e.g. after the bonded calculations.
189 * \param[in] pmeGpu The PME GPU structure.
191 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_forces(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
193 /*! \libinternal \brief
194 * Copies the forces from the GPU to the CPU buffer. To be called after the gathering stage.
196 * \param[in] pmeGpu The PME GPU structure.
198 CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_forces(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
200 /*! \libinternal \brief
201 * Checks whether work in the PME GPU stream has completed.
203 * \param[in] pmeGpu The PME GPU structure.
205 * \returns True if work in the PME stream has completed.
207 CUDA_FUNC_QUALIFIER bool pme_gpu_stream_query(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM_WITH_RETURN(0)
209 /*! \libinternal \brief
210 * Reallocates the input coordinates buffer on the GPU (and clears the padded part if needed).
212 * \param[in] pmeGpu The PME GPU structure.
214 * Needs to be called on every DD step/in the beginning.
216 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_coordinates(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
218 /*! \libinternal \brief
219 * Copies the input coordinates from the CPU buffer onto the GPU.
221 * \param[in] pmeGpu The PME GPU structure.
222 * \param[in] h_coordinates Input coordinates (XYZ rvec array).
224 * Needs to be called for every PME computation. The coordinates are then used in the spline calculation.
226 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_coordinates(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
227 const rvec *CUDA_FUNC_ARGUMENT(h_coordinates)) CUDA_FUNC_TERM
229 /*! \libinternal \brief
230 * Frees the coordinates on the GPU.
232 * \param[in] pmeGpu The PME GPU structure.
234 CUDA_FUNC_QUALIFIER void pme_gpu_free_coordinates(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
236 /*! \libinternal \brief
237 * Reallocates the buffer on the GPU and copies the charges/coefficients from the CPU buffer.
238 * Clears the padded part if needed.
240 * \param[in] pmeGpu The PME GPU structure.
241 * \param[in] h_coefficients The input atom charges/coefficients.
243 * Does not need to be done for every PME computation, only whenever the local charges change.
244 * (So, in the beginning of the run, or on DD step).
246 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
247 const float *CUDA_FUNC_ARGUMENT(h_coefficients)) CUDA_FUNC_TERM
249 /*! \libinternal \brief
250 * Frees the charges/coefficients on the GPU.
252 * \param[in] pmeGpu The PME GPU structure.
254 CUDA_FUNC_QUALIFIER void pme_gpu_free_coefficients(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
256 /*! \libinternal \brief
257 * Reallocates the buffers on the GPU and the host for the atoms spline data.
259 * \param[in] pmeGpu The PME GPU structure.
261 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_spline_data(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
263 /*! \libinternal \brief
264 * Frees the buffers on the GPU for the atoms spline data.
266 * \param[in] pmeGpu The PME GPU structure.
268 CUDA_FUNC_QUALIFIER void pme_gpu_free_spline_data(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
270 /*! \libinternal \brief
271 * Reallocates the buffers on the GPU and the host for the particle gridline indices.
273 * \param[in] pmeGpu The PME GPU structure.
275 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_grid_indices(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
277 /*! \libinternal \brief
278 * Frees the buffer on the GPU for the particle gridline indices.
280 * \param[in] pmeGpu The PME GPU structure.
282 CUDA_FUNC_QUALIFIER void pme_gpu_free_grid_indices(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
284 /*! \libinternal \brief
285 * Reallocates the real space grid and the complex reciprocal grid (if needed) on the GPU.
287 * \param[in] pmeGpu The PME GPU structure.
289 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_grids(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
291 /*! \libinternal \brief
292 * Frees the real space grid and the complex reciprocal grid (if needed) on the GPU.
294 * \param[in] pmeGpu The PME GPU structure.
296 CUDA_FUNC_QUALIFIER void pme_gpu_free_grids(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
298 /*! \libinternal \brief
299 * Clears the real space grid on the GPU.
300 * Should be called at the end of each computation.
302 * \param[in] pmeGpu The PME GPU structure.
304 CUDA_FUNC_QUALIFIER void pme_gpu_clear_grids(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
306 /*! \libinternal \brief
307 * Reallocates and copies the pre-computed fractional coordinates' shifts to the GPU.
309 * \param[in] pmeGpu The PME GPU structure.
311 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
313 /*! \libinternal \brief
314 * Frees the pre-computed fractional coordinates' shifts on the GPU.
316 * \param[in] pmeGpu The PME GPU structure.
318 CUDA_FUNC_QUALIFIER void pme_gpu_free_fract_shifts(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
320 /*! \libinternal \brief
321 * Copies the input real-space grid from the host to the GPU.
323 * \param[in] pmeGpu The PME GPU structure.
324 * \param[in] h_grid The host-side grid buffer.
326 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_gather_grid(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
327 float *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM
329 /*! \libinternal \brief
330 * Copies the output real-space grid from the GPU to the host.
332 * \param[in] pmeGpu The PME GPU structure.
333 * \param[out] h_grid The host-side grid buffer.
335 CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_grid(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
336 float *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM
338 /*! \libinternal \brief
339 * Copies the spread output spline data and gridline indices from the GPU to the host.
341 * \param[in] pmeGpu The PME GPU structure.
343 CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_atom_data(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
345 /*! \libinternal \brief
346 * Copies the gather input spline data and gridline indices from the host to the GPU.
348 * \param[in] pmeGpu The PME GPU structure.
350 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_gather_atom_data(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
352 /*! \libinternal \brief
353 * Waits for the grid copying to the host-side buffer after spreading to finish.
355 * \param[in] pmeGpu The PME GPU structure.
357 CUDA_FUNC_QUALIFIER void pme_gpu_sync_spread_grid(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
359 /*! \libinternal \brief
360 * Does the one-time GPU-framework specific PME initialization.
361 * For CUDA, the PME stream is created with the highest priority.
363 * \param[in] pmeGpu The PME GPU structure.
365 CUDA_FUNC_QUALIFIER void pme_gpu_init_internal(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
367 /*! \libinternal \brief
368 * Destroys the PME GPU-framework specific data.
369 * Should be called last in the PME GPU destructor.
371 * \param[in] pmeGpu The PME GPU structure.
373 CUDA_FUNC_QUALIFIER void pme_gpu_destroy_specific(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
375 /*! \libinternal \brief
376 * Initializes the PME GPU synchronization events.
378 * \param[in] pmeGpu The PME GPU structure.
380 CUDA_FUNC_QUALIFIER void pme_gpu_init_sync_events(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
382 /*! \libinternal \brief
383 * Destroys the PME GPU synchronization events.
385 * \param[in] pmeGpu The PME GPU structure.
387 CUDA_FUNC_QUALIFIER void pme_gpu_destroy_sync_events(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
389 /*! \libinternal \brief
390 * Initializes the CUDA FFT structures.
392 * \param[in] pmeGpu The PME GPU structure.
394 CUDA_FUNC_QUALIFIER void pme_gpu_reinit_3dfft(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
396 /*! \libinternal \brief
397 * Destroys the CUDA FFT structures.
399 * \param[in] pmeGpu The PME GPU structure.
401 CUDA_FUNC_QUALIFIER void pme_gpu_destroy_3dfft(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
403 /* Several CUDA event-based timing functions that live in pme-timings.cu */
405 /*! \libinternal \brief
406 * Finalizes all the active PME GPU stage timings for the current computation. Should be called at the end of every computation.
408 * \param[in] pmeGpu The PME GPU structure.
410 CUDA_FUNC_QUALIFIER void pme_gpu_update_timings(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
412 /*! \libinternal \brief
413 * Updates the internal list of active PME GPU stages (if timings are enabled).
415 * \param[in] pmeGpu The PME GPU data structure.
417 CUDA_FUNC_QUALIFIER void pme_gpu_reinit_timings(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
420 * Resets the PME GPU timings. To be called at the reset MD step.
422 * \param[in] pmeGpu The PME GPU structure.
424 CUDA_FUNC_QUALIFIER void pme_gpu_reset_timings(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
426 /*! \libinternal \brief
427 * Copies the PME GPU timings to the gmx_wallclock_gpu_t structure (for log output). To be called at the run end.
429 * \param[in] pmeGpu The PME GPU structure.
430 * \param[in] timings The gmx_wallclock_gpu_pme_t structure.
432 CUDA_FUNC_QUALIFIER void pme_gpu_get_timings(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
433 gmx_wallclock_gpu_pme_t *CUDA_FUNC_ARGUMENT(timings)) CUDA_FUNC_TERM
435 /* The PME stages themselves */
437 /*! \libinternal \brief
438 * A GPU spline computation and charge spreading function.
440 * \param[in] pmeGpu The PME GPU structure.
441 * \param[in] gridIndex Index of the PME grid - unused, assumed to be 0.
442 * \param[out] h_grid The host-side grid buffer (used only if the result of the spread is expected on the host,
443 * e.g. testing or host-side FFT)
444 * \param[in] computeSplines Should the computation of spline parameters and gridline indices be performed.
445 * \param[in] spreadCharges Should the charges/coefficients be spread on the grid.
447 CUDA_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
448 int CUDA_FUNC_ARGUMENT(gridIndex),
449 real *CUDA_FUNC_ARGUMENT(h_grid),
450 bool CUDA_FUNC_ARGUMENT(computeSplines),
451 bool CUDA_FUNC_ARGUMENT(spreadCharges)) CUDA_FUNC_TERM
453 /*! \libinternal \brief
454 * 3D FFT R2C/C2R routine.
456 * \param[in] pmeGpu The PME GPU structure.
457 * \param[in] direction Transform direction (real-to-complex or complex-to-real)
458 * \param[in] gridIndex Index of the PME grid - unused, assumed to be 0.
460 CUDA_FUNC_QUALIFIER void pme_gpu_3dfft(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
461 enum gmx_fft_direction CUDA_FUNC_ARGUMENT(direction),
462 const int CUDA_FUNC_ARGUMENT(gridIndex)) CUDA_FUNC_TERM
464 /*! \libinternal \brief
465 * A GPU Fourier space solving function.
467 * \param[in] pmeGpu The PME GPU structure.
468 * \param[in,out] h_grid The host-side input and output Fourier grid buffer (used only with testing or host-side FFT)
469 * \param[in] gridOrdering Specifies the dimenion ordering of the complex grid. TODO: store this information?
470 * \param[in] computeEnergyAndVirial Tells if the energy and virial computation should also be performed.
472 CUDA_FUNC_QUALIFIER void pme_gpu_solve(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
473 t_complex *CUDA_FUNC_ARGUMENT(h_grid),
474 GridOrdering CUDA_FUNC_ARGUMENT(gridOrdering),
475 bool CUDA_FUNC_ARGUMENT(computeEnergyAndVirial)) CUDA_FUNC_TERM
477 /*! \libinternal \brief
478 * A GPU force gathering function.
480 * \param[in] pmeGpu The PME GPU structure.
481 * \param[in] forceTreatment Tells how data in h_forces should be treated.
482 * TODO: determine efficiency/balance of host/device-side reductions.
483 * \param[in] h_grid The host-side grid buffer (used only in testing mode)
485 CUDA_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
486 PmeForceOutputHandling CUDA_FUNC_ARGUMENT(forceTreatment),
487 const float *CUDA_FUNC_ARGUMENT(h_grid)
491 /* The inlined convenience PME GPU status getters */
493 /*! \libinternal \brief
494 * Tells if PME runs on multiple GPUs with the decomposition.
496 * \param[in] pmeGpu The PME GPU structure.
497 * \returns True if PME runs on multiple GPUs, false otherwise.
499 gmx_inline bool pme_gpu_uses_dd(const PmeGpu *pmeGpu)
501 return !pmeGpu->settings.useDecomposition;
504 /*! \libinternal \brief
505 * Tells if PME performs the gathering stage on GPU.
507 * \param[in] pmeGpu The PME GPU structure.
508 * \returns True if the gathering is performed on GPU, false otherwise.
510 gmx_inline bool pme_gpu_performs_gather(const PmeGpu *pmeGpu)
512 return pmeGpu->settings.performGPUGather;
515 /*! \libinternal \brief
516 * Tells if PME performs the FFT stages on GPU.
518 * \param[in] pmeGpu The PME GPU structure.
519 * \returns True if FFT is performed on GPU, false otherwise.
521 gmx_inline bool pme_gpu_performs_FFT(const PmeGpu *pmeGpu)
523 return pmeGpu->settings.performGPUFFT;
526 /*! \libinternal \brief
527 * Tells if PME performs the grid (un-)wrapping on GPU.
529 * \param[in] pmeGpu The PME GPU structure.
530 * \returns True if (un-)wrapping is performed on GPU, false otherwise.
532 gmx_inline bool pme_gpu_performs_wrapping(const PmeGpu *pmeGpu)
534 return pmeGpu->settings.useDecomposition;
537 /*! \libinternal \brief
538 * Tells if PME performs the grid solving on GPU.
540 * \param[in] pmeGpu The PME GPU structure.
541 * \returns True if solving is performed on GPU, false otherwise.
543 gmx_inline bool pme_gpu_performs_solve(const PmeGpu *pmeGpu)
545 return pmeGpu->settings.performGPUSolve;
548 /*! \libinternal \brief
549 * Enables or disables the testing mode.
550 * Testing mode only implies copying all the outputs, even the intermediate ones, to the host,
551 * and also makes the copies synchronous.
553 * \param[in] pmeGpu The PME GPU structure.
554 * \param[in] testing Should the testing mode be enabled, or disabled.
556 gmx_inline void pme_gpu_set_testing(PmeGpu *pmeGpu, bool testing)
558 pmeGpu->settings.copyAllOutputs = testing;
559 pmeGpu->settings.transferKind = testing ? GpuApiCallBehavior::Sync : GpuApiCallBehavior::Async;
562 /*! \libinternal \brief
563 * Tells if PME is in the testing mode.
565 * \param[in] pmeGpu The PME GPU structure.
566 * \returns true if testing mode is enabled, false otherwise.
568 gmx_inline bool pme_gpu_is_testing(const PmeGpu *pmeGpu)
570 return pmeGpu->settings.copyAllOutputs;
573 /* A block of C++ functions that live in pme-gpu-internal.cpp */
575 /*! \libinternal \brief
576 * Returns the GPU gathering staging forces buffer.
578 * \param[in] pmeGpu The PME GPU structure.
579 * \returns The input/output forces.
581 gmx::ArrayRef<gmx::RVec> pme_gpu_get_forces(PmeGpu *pmeGpu);
583 /*! \libinternal \brief
584 * Returns the output virial and energy of the PME solving.
586 * \param[in] pmeGpu The PME GPU structure.
587 * \param[out] energy The output energy.
588 * \param[out] virial The output virial matrix.
590 void pme_gpu_get_energy_virial(const PmeGpu *pmeGpu, real *energy, matrix virial);
592 /*! \libinternal \brief
593 * Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_prepare_computation().
595 * \param[in] pmeGpu The PME GPU structure.
596 * \param[in] box The unit cell box.
598 void pme_gpu_update_input_box(PmeGpu *pmeGpu, const matrix box);
600 /*! \libinternal \brief
601 * Finishes the PME GPU computation, waiting for the output forces and/or energy/virial to be copied to the host.
602 * If forces were computed, they will have arrived at the external host buffer provided to gather.
603 * If virial/energy were computed, they will have arrived into the internal staging buffer
604 * (even though that should have already happened before even launching the gather).
605 * Finally, cudaEvent_t based GPU timers get updated if enabled. They also need stream synchronization for correctness.
606 * Additionally, device-side buffers are cleared asynchronously for the next computation.
608 * \param[in] pmeGpu The PME GPU structure.
610 void pme_gpu_finish_computation(const PmeGpu *pmeGpu);
612 //! A binary enum for spline data layout transformation
613 enum class PmeLayoutTransform
619 /*! \libinternal \brief
620 * Rearranges the atom spline data between the GPU and host layouts.
621 * Only used for test purposes so far, likely to be horribly slow.
623 * \param[in] pmeGpu The PME GPU structure.
624 * \param[out] atc The PME CPU atom data structure (with a single-threaded layout).
625 * \param[in] type The spline data type (values or derivatives).
626 * \param[in] dimIndex Dimension index.
627 * \param[in] transform Layout transform type
629 void pme_gpu_transform_spline_atom_data(const PmeGpu *pmeGpu, const pme_atomcomm_t *atc,
630 PmeSplineDataType type, int dimIndex, PmeLayoutTransform transform);
632 /*! \libinternal \brief
633 * Get the normal/padded grid dimensions of the real-space PME grid on GPU. Only used in tests.
635 * \param[in] pmeGpu The PME GPU structure.
636 * \param[out] gridSize Pointer to the grid dimensions to fill in.
637 * \param[out] paddedGridSize Pointer to the padded grid dimensions to fill in.
639 void pme_gpu_get_real_grid_sizes(const PmeGpu *pmeGpu, gmx::IVec *gridSize, gmx::IVec *paddedGridSize);
641 /*! \libinternal \brief
642 * (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
644 * \param[in,out] pme The PME structure.
645 * \param[in,out] gpuInfo The GPU information structure.
646 * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
648 void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo);
650 /*! \libinternal \brief
651 * Destroys the PME GPU data at the end of the run.
653 * \param[in] pmeGpu The PME GPU structure.
655 void pme_gpu_destroy(PmeGpu *pmeGpu);
657 /*! \libinternal \brief
658 * Reallocates the local atoms data (charges, coordinates, etc.). Copies the charges to the GPU.
660 * \param[in] pmeGpu The PME GPU structure.
661 * \param[in] nAtoms The number of particles.
662 * \param[in] charges The pointer to the host-side array of particle charges.
664 * This is a function that should only be called in the beginning of the run and on domain decomposition.
665 * Should be called before the pme_gpu_set_io_ranges.
667 void pme_gpu_reinit_atoms(PmeGpu *pmeGpu,
669 const real *charges);
671 /*! \brief \libinternal
672 * The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
674 * This clears the device-side working buffers in preparation for new computation.
676 * \param[in] pmeGpu The PME GPU structure.
678 void pme_gpu_reinit_computation(const PmeGpu *pmeGpu);