src/gromacs/ewald/pme-gpu-internal.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2016,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *
  38  * \brief This file contains internal function definitions for performing the PME calculations on GPU.
  39  * These are not meant to be exposed outside of the PME GPU code.
  40  * As of now, their bodies are still in the common pme-gpu.cpp files.
  41  *
  42  * \author Aleksei Iupinov <a.yupinov@gmail.com>
  43  * \ingroup module_ewald
  44  */
  45
  46 #ifndef GMX_EWALD_PME_GPU_INTERNAL_H
  47 #define GMX_EWALD_PME_GPU_INTERNAL_H
  48
  49 #include "gromacs/gpu_utils/gpu_macros.h"      // for the CUDA_FUNC_ macros
  50
  51 #include "pme-gpu-types.h"                     // for the inline functions accessing pme_gpu_t members
  52
  53 struct gmx_hw_info_t;
  54 struct gmx_gpu_opt_t;
  55 struct gmx_pme_t;                              // only used in pme_gpu_reinit
  56 struct t_commrec;
  57 struct gmx_wallclock_gpu_pme_t;
  58 struct pme_atomcomm_t;
  59
  60 namespace gmx
  61 {
  62 class MDLogger;
  63 }
  64
  65 /* Some general constants for PME GPU behaviour follow. */
  66
  67 /*! \brief \libinternal
  68  * false: The atom data GPU buffers are sized precisely according to the number of atoms.
  69  *        (Except GPU spline data layout which is regardless intertwined for 2 atoms per warp).
  70  *        The atom index checks in the spread/gather code potentially hinder the performance.
  71  * true:  The atom data GPU buffers are padded with zeroes so that the possible number of atoms
  72  *        fitting in is divisible by PME_ATOM_DATA_ALIGNMENT.
  73  *        The atom index checks are not performed. There should be a performance win, but how big is it, remains to be seen.
  74  *        Additional cudaMemsetAsync calls are done occasionally (only charges/coordinates; spline data is always recalculated now).
  75  * \todo Estimate performance differences
  76  */
  77 const bool c_usePadding = true;
  78
  79 /*! \brief \libinternal
  80  * false: Atoms with zero charges are processed by PME. Could introduce some overhead.
  81  * true:  Atoms with zero charges are not processed by PME. Adds branching to the spread/gather.
  82  *        Could be good for performance in specific systems with lots of neutral atoms.
  83  * \todo Estimate performance differences.
  84  */
  85 const bool c_skipNeutralAtoms = false;
  86
  87 /*! \brief \libinternal
  88  * Number of PME solve output floating point numbers.
  89  * 6 for symmetric virial matrix + 1 for reciprocal energy.
  90  */
  91 const int c_virialAndEnergyCount = 7;
  92
  93 /* A block of CUDA-only functions that live in pme.cu */
  94
  95 /*! \libinternal \brief
  96  * Returns the number of atoms per chunk in the atom charges/coordinates data layout.
  97  * Depends on CUDA-specific block sizes, needed for the atom data padding.
  98  *
  99  * \param[in] pmeGPU            The PME GPU structure.
 100  * \returns   Number of atoms in a single GPU atom data chunk.
 101  */
 102 CUDA_FUNC_QUALIFIER int pme_gpu_get_atom_data_alignment(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM_WITH_RETURN(1)
 103
 104 /*! \libinternal \brief
 105  * Returns the number of atoms per chunk in the atom spline theta/dtheta data layout.
 106  *
 107  * \param[in] pmeGPU            The PME GPU structure.
 108  * \returns   Number of atoms in a single GPU atom spline data chunk.
 109  */
 110 CUDA_FUNC_QUALIFIER int pme_gpu_get_atom_spline_data_alignment(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM_WITH_RETURN(1)
 111
 112 /*! \libinternal \brief
 113  * Synchronizes the current step, waiting for the GPU kernels/transfers to finish.
 114  *
 115  * \param[in] pmeGPU            The PME GPU structure.
 116  */
 117 CUDA_FUNC_QUALIFIER void pme_gpu_synchronize(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 118
 119 /*! \libinternal \brief
 120  * Allocates the fixed size energy and virial buffer both on GPU and CPU.
 121  *
 122  * \param[in] pmeGPU            The PME GPU structure.
 123  */
 124 CUDA_FUNC_QUALIFIER void pme_gpu_alloc_energy_virial(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 125
 126 /*! \libinternal \brief
 127  * Frees the energy and virial memory both on GPU and CPU.
 128  *
 129  * \param[in] pmeGPU            The PME GPU structure.
 130  */
 131 CUDA_FUNC_QUALIFIER void pme_gpu_free_energy_virial(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 132
 133 /*! \libinternal \brief
 134  * Clears the energy and virial memory on GPU with 0.
 135  * Should be called at the end of the energy/virial calculation step.
 136  *
 137  * \param[in] pmeGPU            The PME GPU structure.
 138  */
 139 CUDA_FUNC_QUALIFIER void pme_gpu_clear_energy_virial(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 140
 141 /*! \libinternal \brief
 142  * Reallocates and copies the pre-computed B-spline values to the GPU.
 143  *
 144  * \param[in] pmeGPU             The PME GPU structure.
 145  */
 146 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_bspline_values(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 147
 148 /*! \libinternal \brief
 149  * Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers).
 150  *
 151  * \param[in] pmeGPU             The PME GPU structure.
 152  */
 153 CUDA_FUNC_QUALIFIER void pme_gpu_free_bspline_values(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 154
 155 /*! \libinternal \brief
 156  * Reallocates the GPU buffer for the PME forces.
 157  *
 158  * \param[in] pmeGPU             The PME GPU structure.
 159  */
 160 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 161
 162 /*! \libinternal \brief
 163  * Frees the GPU buffer for the PME forces.
 164  *
 165  * \param[in] pmeGPU             The PME GPU structure.
 166  */
 167 CUDA_FUNC_QUALIFIER void pme_gpu_free_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 168
 169 /*! \libinternal \brief
 170  * Copies the forces from the CPU buffer to the GPU (to reduce them with the PME GPU gathered forces).
 171  * To be called e.g. after the bonded calculations.
 172  *
 173  * \param[in] pmeGPU             The PME GPU structure.
 174  * \param[in] h_forces           The input forces rvec buffer.
 175  */
 176 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
 177                                                    const float     *CUDA_FUNC_ARGUMENT(h_forces)) CUDA_FUNC_TERM
 178
 179 /*! \libinternal \brief
 180  * Copies the forces from the GPU to the CPU buffer. To be called after the gathering stage.
 181  *
 182  * \param[in] pmeGPU             The PME GPU structure.
 183  * \param[out] h_forces          The output forces rvec buffer.
 184  */
 185 CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
 186                                                     float           *CUDA_FUNC_ARGUMENT(h_forces)) CUDA_FUNC_TERM
 187
 188 /*! \libinternal \brief
 189  * Waits for the PME GPU output forces copying to the CPU buffer to finish.
 190  *
 191  * \param[in] pmeGPU            The PME GPU structure.
 192  */
 193 CUDA_FUNC_QUALIFIER void pme_gpu_sync_output_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 194
 195 /*! \libinternal \brief
 196  * Reallocates the input coordinates buffer on the GPU (and clears the padded part if needed).
 197  *
 198  * \param[in] pmeGPU            The PME GPU structure.
 199  *
 200  * Needs to be called on every DD step/in the beginning.
 201  */
 202 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_coordinates(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 203
 204 /*! \libinternal \brief
 205  * Copies the input coordinates from the CPU buffer onto the GPU.
 206  *
 207  * \param[in] pmeGPU            The PME GPU structure.
 208  * \param[in] h_coordinates     Input coordinates (XYZ rvec array).
 209  *
 210  * Needs to be called every MD step. The coordinates are then used in the spline calculation.
 211  */
 212 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_coordinates(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
 213                                                         const rvec      *CUDA_FUNC_ARGUMENT(h_coordinates)) CUDA_FUNC_TERM
 214
 215 /*! \libinternal \brief
 216  * Frees the coordinates on the GPU.
 217  *
 218  * \param[in] pmeGPU            The PME GPU structure.
 219  */
 220 CUDA_FUNC_QUALIFIER void pme_gpu_free_coordinates(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 221
 222 /*! \libinternal \brief
 223  * Reallocates the buffer on the GPU and copies the charges/coefficients from the CPU buffer.
 224  * Clears the padded part if needed.
 225  *
 226  * \param[in] pmeGPU            The PME GPU structure.
 227  * \param[in] h_coefficients    The input atom charges/coefficients.
 228  *
 229  * Does not need to be done every MD step, only whenever the local charges change.
 230  * (So, in the beginning of the run, or on DD step).
 231  */
 232 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_input_coefficients(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
 233                                                                      const float     *CUDA_FUNC_ARGUMENT(h_coefficients)) CUDA_FUNC_TERM
 234
 235 /*! \libinternal \brief
 236  * Frees the charges/coefficients on the GPU.
 237  *
 238  * \param[in] pmeGPU             The PME GPU structure.
 239  */
 240 CUDA_FUNC_QUALIFIER void pme_gpu_free_coefficients(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 241
 242 /*! \libinternal \brief
 243  * Reallocates the buffers on the GPU and the host for the atoms spline data.
 244  *
 245  * \param[in] pmeGPU            The PME GPU structure.
 246  */
 247 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_spline_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 248
 249 /*! \libinternal \brief
 250  * Frees the buffers on the GPU for the atoms spline data.
 251  *
 252  * \param[in] pmeGPU            The PME GPU structure.
 253  */
 254 CUDA_FUNC_QUALIFIER void pme_gpu_free_spline_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 255
 256 /*! \libinternal \brief
 257  * Reallocates the buffers on the GPU and the host for the particle gridline indices.
 258  *
 259  * \param[in] pmeGPU            The PME GPU structure.
 260  */
 261 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_grid_indices(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 262
 263 /*! \libinternal \brief
 264  * Frees the buffer on the GPU for the particle gridline indices.
 265  *
 266  * \param[in] pmeGPU            The PME GPU structure.
 267  */
 268 CUDA_FUNC_QUALIFIER void pme_gpu_free_grid_indices(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 269
 270 /*! \libinternal \brief
 271  * Reallocates the real space grid and the complex reciprocal grid (if needed) on the GPU.
 272  *
 273  * \param[in] pmeGPU            The PME GPU structure.
 274  */
 275 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_grids(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 276
 277 /*! \libinternal \brief
 278  * Frees the real space grid and the complex reciprocal grid (if needed) on the GPU.
 279  *
 280  * \param[in] pmeGPU            The PME GPU structure.
 281  */
 282 CUDA_FUNC_QUALIFIER void pme_gpu_free_grids(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 283
 284 /*! \libinternal \brief
 285  * Clears the real space grid on the GPU.
 286  * Should be called at the end of each MD step.
 287  *
 288  * \param[in] pmeGPU            The PME GPU structure.
 289  */
 290 CUDA_FUNC_QUALIFIER void pme_gpu_clear_grids(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 291
 292 /*! \libinternal \brief
 293  * Reallocates and copies the pre-computed fractional coordinates' shifts to the GPU.
 294  *
 295  * \param[in] pmeGPU            The PME GPU structure.
 296  */
 297 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_fract_shifts(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 298
 299 /*! \libinternal \brief
 300  * Frees the pre-computed fractional coordinates' shifts on the GPU.
 301  *
 302  * \param[in] pmeGPU            The PME GPU structure.
 303  */
 304 CUDA_FUNC_QUALIFIER void pme_gpu_free_fract_shifts(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 305
 306 /*! \libinternal \brief
 307  * Waits for the output virial/energy copying to the intermediate CPU buffer to finish.
 308  *
 309  * \param[in] pmeGPU  The PME GPU structure.
 310  */
 311 CUDA_FUNC_QUALIFIER void pme_gpu_sync_output_energy_virial(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 312
 313 /*! \libinternal \brief
 314  * Copies the input real-space grid from the host to the GPU.
 315  *
 316  * \param[in] pmeGPU   The PME GPU structure.
 317  * \param[in] h_grid   The host-side grid buffer.
 318  */
 319 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_gather_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
 320                                                         float           *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM
 321
 322 /*! \libinternal \brief
 323  * Copies the output real-space grid from the GPU to the host.
 324  *
 325  * \param[in] pmeGPU   The PME GPU structure.
 326  * \param[out] h_grid  The host-side grid buffer.
 327  */
 328 CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
 329                                                          float           *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM
 330
 331 /*! \libinternal \brief
 332  * Waits for the grid copying to the host-side buffer after spreading to finish.
 333  *
 334  * \param[in] pmeGPU  The PME GPU structure.
 335  */
 336 CUDA_FUNC_QUALIFIER void pme_gpu_sync_spread_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 337
 338 /*! \libinternal \brief
 339  * Waits for the atom data copying to the intermediate host-side buffer after spline computation to finish.
 340  *
 341  * \param[in] pmeGPU  The PME GPU structure.
 342  */
 343 CUDA_FUNC_QUALIFIER void pme_gpu_sync_spline_atom_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 344
 345 /*! \libinternal \brief
 346  * Waits for the grid copying to the host-side buffer after solving to finish.
 347  *
 348  * \param[in] pmeGPU  The PME GPU structure.
 349  */
 350 CUDA_FUNC_QUALIFIER void pme_gpu_sync_solve_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 351
 352 /*! \libinternal \brief
 353  * Does the one-time GPU-framework specific PME initialization.
 354  * For CUDA, the PME stream is created with the highest priority.
 355  *
 356  * \param[in] pmeGPU  The PME GPU structure.
 357  */
 358 CUDA_FUNC_QUALIFIER void pme_gpu_init_internal(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 359
 360 /*! \libinternal \brief
 361  * Destroys the PME GPU-framework specific data.
 362  * Should be called last in the PME GPU destructor.
 363  *
 364  * \param[in] pmeGPU  The PME GPU structure.
 365  */
 366 CUDA_FUNC_QUALIFIER void pme_gpu_destroy_specific(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 367
 368 /*! \libinternal \brief
 369  * Initializes the PME GPU synchronization events.
 370  *
 371  * \param[in] pmeGPU  The PME GPU structure.
 372  */
 373 CUDA_FUNC_QUALIFIER void pme_gpu_init_sync_events(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 374
 375 /*! \libinternal \brief
 376  * Destroys the PME GPU synchronization events.
 377  *
 378  * \param[in] pmeGPU  The PME GPU structure.
 379  */
 380 CUDA_FUNC_QUALIFIER void pme_gpu_destroy_sync_events(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 381
 382 /*! \libinternal \brief
 383  * Initializes the CUDA FFT structures.
 384  *
 385  * \param[in] pmeGPU  The PME GPU structure.
 386  */
 387 CUDA_FUNC_QUALIFIER void pme_gpu_reinit_3dfft(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 388
 389 /*! \libinternal \brief
 390  * Destroys the CUDA FFT structures.
 391  *
 392  * \param[in] pmeGPU  The PME GPU structure.
 393  */
 394 CUDA_FUNC_QUALIFIER void pme_gpu_destroy_3dfft(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 395
 396 /* Several CUDA event-based timing functions that live in pme-timings.cu */
 397
 398 /*! \libinternal \brief
 399  * Finalizes all the PME GPU stage timings for the current step. Should be called at the end of every step.
 400  *
 401  * \param[in] pmeGPU         The PME GPU structure.
 402  */
 403 CUDA_FUNC_QUALIFIER void pme_gpu_update_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 404
 405 /*! \brief
 406  * Resets the PME GPU timings. To be called at the reset step.
 407  *
 408  * \param[in] pmeGPU         The PME GPU structure.
 409  */
 410 CUDA_FUNC_QUALIFIER void pme_gpu_reset_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 411
 412 /*! \libinternal \brief
 413  * Copies the PME GPU timings to the gmx_wallclock_gpu_t structure (for log output). To be called at the run end.
 414  *
 415  * \param[in] pmeGPU         The PME GPU structure.
 416  * \param[in] timings        The gmx_wallclock_gpu_pme_t structure.
 417  */
 418 CUDA_FUNC_QUALIFIER void pme_gpu_get_timings(const pme_gpu_t         *CUDA_FUNC_ARGUMENT(pmeGPU),
 419                                              gmx_wallclock_gpu_pme_t *CUDA_FUNC_ARGUMENT(timings)) CUDA_FUNC_TERM
 420
 421 /* The inlined convenience PME GPU status getters */
 422
 423 /*! \libinternal \brief
 424  * Tells if PME runs on multiple GPUs with the decomposition.
 425  *
 426  * \param[in] pmeGPU         The PME GPU structure.
 427  * \returns                  True if PME runs on multiple GPUs, false otherwise.
 428  */
 429 gmx_inline bool pme_gpu_uses_dd(const pme_gpu_t *pmeGPU)
 430 {
 431     return !pmeGPU->settings.useDecomposition;
 432 }
 433
 434 /*! \libinternal \brief
 435  * Tells if PME performs the gathering stage on GPU.
 436  *
 437  * \param[in] pmeGPU         The PME GPU structure.
 438  * \returns                  True if the gathering is performed on GPU, false otherwise.
 439  */
 440 gmx_inline bool pme_gpu_performs_gather(const pme_gpu_t *pmeGPU)
 441 {
 442     return pmeGPU->settings.performGPUGather;
 443 }
 444
 445 /*! \libinternal \brief
 446  * Tells if PME performs the FFT stages on GPU.
 447  *
 448  * \param[in] pmeGPU         The PME GPU structure.
 449  * \returns                  True if FFT is performed on GPU, false otherwise.
 450  */
 451 gmx_inline bool pme_gpu_performs_FFT(const pme_gpu_t *pmeGPU)
 452 {
 453     return pmeGPU->settings.performGPUFFT;
 454 }
 455
 456 /*! \libinternal \brief
 457  * Tells if PME performs the grid (un-)wrapping on GPU.
 458  *
 459  * \param[in] pmeGPU         The PME GPU structure.
 460  * \returns                  True if (un-)wrapping is performed on GPU, false otherwise.
 461  */
 462 gmx_inline bool pme_gpu_performs_wrapping(const pme_gpu_t *pmeGPU)
 463 {
 464     return pmeGPU->settings.useDecomposition;
 465 }
 466
 467 /*! \libinternal \brief
 468  * Tells if PME performs the grid solving on GPU.
 469  *
 470  * \param[in] pmeGPU         The PME GPU structure.
 471  * \returns                  True if solving is performed on GPU, false otherwise.
 472  */
 473 gmx_inline bool pme_gpu_performs_solve(const pme_gpu_t *pmeGPU)
 474 {
 475     return pmeGPU->settings.performGPUSolve;
 476 }
 477
 478 /* A block of C++ functions that live in pme-gpu-internal.cpp */
 479
 480 /*! \libinternal \brief
 481  * Returns the output virial and energy of the PME solving.
 482  * Should be called after pme_gpu_finish_step.
 483  *
 484  * \param[in] pmeGPU             The PME GPU structure.
 485  * \param[out] energy            The output energy.
 486  * \param[out] virial            The output virial matrix.
 487  */
 488 void pme_gpu_get_energy_virial(const pme_gpu_t *pmeGPU, real *energy, matrix virial);
 489
 490 /*! \libinternal \brief
 491  * Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_start_step().
 492  *
 493  * \param[in] pmeGPU         The PME GPU structure.
 494  * \param[in] box            The unit cell box.
 495  */
 496 void pme_gpu_update_input_box(pme_gpu_t *pmeGPU, const matrix box);
 497
 498 /*! \libinternal \brief
 499  * Starts the PME GPU step (copies coordinates onto GPU, possibly sets the unit cell parameters).
 500  *
 501  * \param[in] pmeGPU           The PME GPU structure.
 502  * \param[in] needToUpdateBox  Tells if the stored unit cell parameters should be updated from \p box.
 503  * \param[in] box              The unit cell box which does not necessarily change every step (only with pressure coupling enabled).
 504  *                             Would only be used if \p needToUpdateBox is true.
 505  * \param[in] h_coordinates    Input coordinates (XYZ rvec array).
 506  */
 507 void pme_gpu_start_step(pme_gpu_t *pmeGPU, bool needToUpdateBox, const matrix box, const rvec *h_coordinates);
 508
 509 /*! \libinternal \brief
 510  * Finishes the PME GPU step, waiting for the output forces and/or energy/virial to be copied to the host.
 511  *
 512  * \param[in] pmeGPU         The PME GPU structure.
 513  * \param[in] bCalcForces    The left-over flag from the CPU code which tells the function to copy the forces to the CPU side. Should be passed to the launch call instead. FIXME
 514  * \param[in] bCalcEnerVir   The left-over flag from the CPU code which tells the function to copy the energy/virial to the CPU side. Should be passed to the launch call instead.
 515  */
 516 void pme_gpu_finish_step(const pme_gpu_t *pmeGPU,  const bool       bCalcForces,
 517                          const bool       bCalcEnerVir);
 518
 519 /*! \libinternal \brief
 520  * (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
 521  *
 522  * \param[in,out] pme       The PME structure.
 523  * \param[in,out] gpuInfo   The GPU information structure.
 524  * \param[in]     mdlog     The logger.
 525  * \param[in]     cr        The communication structure.
 526  * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
 527  */
 528 void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, const t_commrec *cr);
 529
 530 /*! \libinternal \brief
 531  * Destroys the PME GPU data at the end of the run.
 532  *
 533  * \param[in] pmeGPU     The PME GPU structure.
 534  */
 535 void pme_gpu_destroy(pme_gpu_t *pmeGPU);
 536
 537 /*! \libinternal \brief
 538  * Reallocates the local atoms data (charges, coordinates, etc.). Copies the charges to the GPU.
 539  *
 540  * \param[in] pmeGPU    The PME GPU structure.
 541  * \param[in] nAtoms    The number of particles.
 542  * \param[in] charges   The pointer to the host-side array of particle charges.
 543  *
 544  * This is a function that should only be called in the beginning of the run and on domain decomposition.
 545  * Should be called before the pme_gpu_set_io_ranges.
 546  */
 547 void pme_gpu_reinit_atoms(pme_gpu_t        *pmeGPU,
 548                           const int         nAtoms,
 549                           const real       *charges);
 550
 551 #endif