src/gromacs/ewald/pme-gpu-internal.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2016,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *
  38  * \brief This file contains internal function definitions for performing the PME calculations on GPU.
  39  * These are not meant to be exposed outside of the PME GPU code.
  40  * As of now, their bodies are still in the common pme-gpu.cpp files.
  41  *
  42  * \author Aleksei Iupinov <a.yupinov@gmail.com>
  43  * \ingroup module_ewald
  44  */
  45
  46 #ifndef GMX_EWALD_PME_GPU_INTERNAL_H
  47 #define GMX_EWALD_PME_GPU_INTERNAL_H
  48
  49 #include "gromacs/fft/fft.h"                   // for the gmx_fft_direction enum
  50 #include "gromacs/gpu_utils/gpu_macros.h"      // for the CUDA_FUNC_ macros
  51 #include "gromacs/utility/arrayref.h"
  52
  53 #include "pme-gpu-types.h"                     // for the inline functions accessing PmeGpu members
  54
  55 struct gmx_hw_info_t;
  56 struct gmx_gpu_opt_t;
  57 struct gmx_pme_t;                              // only used in pme_gpu_reinit
  58 struct gmx_wallclock_gpu_pme_t;
  59 struct pme_atomcomm_t;
  60 struct t_complex;
  61
  62 namespace gmx
  63 {
  64 class MDLogger;
  65 }
  66
  67 //! Type of spline data
  68 enum class PmeSplineDataType
  69 {
  70     Values,      // theta
  71     Derivatives, // dtheta
  72 };               //TODO move this into new and shiny pme.h (pme-types.h?)
  73
  74 //! PME grid dimension ordering (from major to minor)
  75 enum class GridOrdering
  76 {
  77     YZX,
  78     XYZ
  79 };
  80
  81 /* Some general constants for PME GPU behaviour follow. */
  82
  83 /*! \brief \libinternal
  84  * false: The atom data GPU buffers are sized precisely according to the number of atoms.
  85  *        (Except GPU spline data layout which is regardless intertwined for 2 atoms per warp).
  86  *        The atom index checks in the spread/gather code potentially hinder the performance.
  87  * true:  The atom data GPU buffers are padded with zeroes so that the possible number of atoms
  88  *        fitting in is divisible by PME_ATOM_DATA_ALIGNMENT.
  89  *        The atom index checks are not performed. There should be a performance win, but how big is it, remains to be seen.
  90  *        Additional cudaMemsetAsync calls are done occasionally (only charges/coordinates; spline data is always recalculated now).
  91  * \todo Estimate performance differences
  92  */
  93 const bool c_usePadding = true;
  94
  95 /*! \brief \libinternal
  96  * false: Atoms with zero charges are processed by PME. Could introduce some overhead.
  97  * true:  Atoms with zero charges are not processed by PME. Adds branching to the spread/gather.
  98  *        Could be good for performance in specific systems with lots of neutral atoms.
  99  * \todo Estimate performance differences.
 100  */
 101 const bool c_skipNeutralAtoms = false;
 102
 103 /*! \brief \libinternal
 104  * Number of PME solve output floating point numbers.
 105  * 6 for symmetric virial matrix + 1 for reciprocal energy.
 106  */
 107 const int c_virialAndEnergyCount = 7;
 108
 109 /* A block of CUDA-only functions that live in pme.cu */
 110
 111 /*! \libinternal \brief
 112  * Returns the number of atoms per chunk in the atom charges/coordinates data layout.
 113  * Depends on CUDA-specific block sizes, needed for the atom data padding.
 114  *
 115  * \param[in] pmeGPU            The PME GPU structure.
 116  * \returns   Number of atoms in a single GPU atom data chunk.
 117  */
 118 CUDA_FUNC_QUALIFIER int pme_gpu_get_atom_data_alignment(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM_WITH_RETURN(1)
 119
 120 /*! \libinternal \brief
 121  * Returns the number of atoms per chunk in the atom spline theta/dtheta data layout.
 122  *
 123  * \param[in] pmeGPU            The PME GPU structure.
 124  * \returns   Number of atoms in a single GPU atom spline data chunk.
 125  */
 126 CUDA_FUNC_QUALIFIER int pme_gpu_get_atoms_per_warp(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM_WITH_RETURN(1)
 127
 128 /*! \libinternal \brief
 129  * Synchronizes the current computation, waiting for the GPU kernels/transfers to finish.
 130  *
 131  * \param[in] pmeGPU            The PME GPU structure.
 132  */
 133 CUDA_FUNC_QUALIFIER void pme_gpu_synchronize(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 134
 135 /*! \libinternal \brief
 136  * Allocates the fixed size energy and virial buffer both on GPU and CPU.
 137  *
 138  * \param[in] pmeGPU            The PME GPU structure.
 139  */
 140 CUDA_FUNC_QUALIFIER void pme_gpu_alloc_energy_virial(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 141
 142 /*! \libinternal \brief
 143  * Frees the energy and virial memory both on GPU and CPU.
 144  *
 145  * \param[in] pmeGPU            The PME GPU structure.
 146  */
 147 CUDA_FUNC_QUALIFIER void pme_gpu_free_energy_virial(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 148
 149 /*! \libinternal \brief
 150  * Clears the energy and virial memory on GPU with 0.
 151  * Should be called at the end of PME computation which returned energy/virial.
 152  *
 153  * \param[in] pmeGPU            The PME GPU structure.
 154  */
 155 CUDA_FUNC_QUALIFIER void pme_gpu_clear_energy_virial(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 156
 157 /*! \libinternal \brief
 158  * Reallocates and copies the pre-computed B-spline values to the GPU.
 159  *
 160  * \param[in] pmeGPU             The PME GPU structure.
 161  */
 162 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_bspline_values(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 163
 164 /*! \libinternal \brief
 165  * Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers).
 166  *
 167  * \param[in] pmeGPU             The PME GPU structure.
 168  */
 169 CUDA_FUNC_QUALIFIER void pme_gpu_free_bspline_values(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 170
 171 /*! \libinternal \brief
 172  * Reallocates the GPU buffer for the PME forces.
 173  *
 174  * \param[in] pmeGPU             The PME GPU structure.
 175  */
 176 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_forces(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 177
 178 /*! \libinternal \brief
 179  * Frees the GPU buffer for the PME forces.
 180  *
 181  * \param[in] pmeGPU             The PME GPU structure.
 182  */
 183 CUDA_FUNC_QUALIFIER void pme_gpu_free_forces(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 184
 185 /*! \libinternal \brief
 186  * Copies the forces from the CPU buffer to the GPU (to reduce them with the PME GPU gathered forces).
 187  * To be called e.g. after the bonded calculations.
 188  *
 189  * \param[in] pmeGPU             The PME GPU structure.
 190  */
 191 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_forces(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 192
 193 /*! \libinternal \brief
 194  * Copies the forces from the GPU to the CPU buffer. To be called after the gathering stage.
 195  *
 196  * \param[in] pmeGPU             The PME GPU structure.
 197  */
 198 CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_forces(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 199
 200 /*! \libinternal \brief
 201  * Reallocates the input coordinates buffer on the GPU (and clears the padded part if needed).
 202  *
 203  * \param[in] pmeGPU            The PME GPU structure.
 204  *
 205  * Needs to be called on every DD step/in the beginning.
 206  */
 207 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_coordinates(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 208
 209 /*! \libinternal \brief
 210  * Copies the input coordinates from the CPU buffer onto the GPU.
 211  *
 212  * \param[in] pmeGPU            The PME GPU structure.
 213  * \param[in] h_coordinates     Input coordinates (XYZ rvec array).
 214  *
 215  * Needs to be called for every PME computation. The coordinates are then used in the spline calculation.
 216  */
 217 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_coordinates(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGPU),
 218                                                         const rvec      *CUDA_FUNC_ARGUMENT(h_coordinates)) CUDA_FUNC_TERM
 219
 220 /*! \libinternal \brief
 221  * Frees the coordinates on the GPU.
 222  *
 223  * \param[in] pmeGPU            The PME GPU structure.
 224  */
 225 CUDA_FUNC_QUALIFIER void pme_gpu_free_coordinates(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 226
 227 /*! \libinternal \brief
 228  * Reallocates the buffer on the GPU and copies the charges/coefficients from the CPU buffer.
 229  * Clears the padded part if needed.
 230  *
 231  * \param[in] pmeGPU            The PME GPU structure.
 232  * \param[in] h_coefficients    The input atom charges/coefficients.
 233  *
 234  * Does not need to be done for every PME computation, only whenever the local charges change.
 235  * (So, in the beginning of the run, or on DD step).
 236  */
 237 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGPU),
 238                                                                      const float     *CUDA_FUNC_ARGUMENT(h_coefficients)) CUDA_FUNC_TERM
 239
 240 /*! \libinternal \brief
 241  * Frees the charges/coefficients on the GPU.
 242  *
 243  * \param[in] pmeGPU             The PME GPU structure.
 244  */
 245 CUDA_FUNC_QUALIFIER void pme_gpu_free_coefficients(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 246
 247 /*! \libinternal \brief
 248  * Reallocates the buffers on the GPU and the host for the atoms spline data.
 249  *
 250  * \param[in] pmeGPU            The PME GPU structure.
 251  */
 252 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_spline_data(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 253
 254 /*! \libinternal \brief
 255  * Frees the buffers on the GPU for the atoms spline data.
 256  *
 257  * \param[in] pmeGPU            The PME GPU structure.
 258  */
 259 CUDA_FUNC_QUALIFIER void pme_gpu_free_spline_data(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 260
 261 /*! \libinternal \brief
 262  * Reallocates the buffers on the GPU and the host for the particle gridline indices.
 263  *
 264  * \param[in] pmeGPU            The PME GPU structure.
 265  */
 266 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_grid_indices(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 267
 268 /*! \libinternal \brief
 269  * Frees the buffer on the GPU for the particle gridline indices.
 270  *
 271  * \param[in] pmeGPU            The PME GPU structure.
 272  */
 273 CUDA_FUNC_QUALIFIER void pme_gpu_free_grid_indices(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 274
 275 /*! \libinternal \brief
 276  * Reallocates the real space grid and the complex reciprocal grid (if needed) on the GPU.
 277  *
 278  * \param[in] pmeGPU            The PME GPU structure.
 279  */
 280 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_grids(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 281
 282 /*! \libinternal \brief
 283  * Frees the real space grid and the complex reciprocal grid (if needed) on the GPU.
 284  *
 285  * \param[in] pmeGPU            The PME GPU structure.
 286  */
 287 CUDA_FUNC_QUALIFIER void pme_gpu_free_grids(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 288
 289 /*! \libinternal \brief
 290  * Clears the real space grid on the GPU.
 291  * Should be called at the end of each computation.
 292  *
 293  * \param[in] pmeGPU            The PME GPU structure.
 294  */
 295 CUDA_FUNC_QUALIFIER void pme_gpu_clear_grids(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 296
 297 /*! \libinternal \brief
 298  * Reallocates and copies the pre-computed fractional coordinates' shifts to the GPU.
 299  *
 300  * \param[in] pmeGPU            The PME GPU structure.
 301  */
 302 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 303
 304 /*! \libinternal \brief
 305  * Frees the pre-computed fractional coordinates' shifts on the GPU.
 306  *
 307  * \param[in] pmeGPU            The PME GPU structure.
 308  */
 309 CUDA_FUNC_QUALIFIER void pme_gpu_free_fract_shifts(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 310
 311 /*! \libinternal \brief
 312  * Copies the input real-space grid from the host to the GPU.
 313  *
 314  * \param[in] pmeGPU   The PME GPU structure.
 315  * \param[in] h_grid   The host-side grid buffer.
 316  */
 317 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_gather_grid(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGPU),
 318                                                         float           *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM
 319
 320 /*! \libinternal \brief
 321  * Copies the output real-space grid from the GPU to the host.
 322  *
 323  * \param[in] pmeGPU   The PME GPU structure.
 324  * \param[out] h_grid  The host-side grid buffer.
 325  */
 326 CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_grid(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGPU),
 327                                                          float           *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM
 328
 329 /*! \libinternal \brief
 330  * Copies the spread output spline data and gridline indices from the GPU to the host.
 331  *
 332  * \param[in] pmeGPU   The PME GPU structure.
 333  */
 334 CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_atom_data(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 335
 336 /*! \libinternal \brief
 337  * Copies the gather input spline data and gridline indices from the host to the GPU.
 338  *
 339  * \param[in] pmeGPU   The PME GPU structure.
 340  */
 341 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_gather_atom_data(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 342
 343 /*! \libinternal \brief
 344  * Waits for the grid copying to the host-side buffer after spreading to finish.
 345  *
 346  * \param[in] pmeGPU  The PME GPU structure.
 347  */
 348 CUDA_FUNC_QUALIFIER void pme_gpu_sync_spread_grid(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 349
 350 /*! \libinternal \brief
 351  * Does the one-time GPU-framework specific PME initialization.
 352  * For CUDA, the PME stream is created with the highest priority.
 353  *
 354  * \param[in] pmeGPU  The PME GPU structure.
 355  */
 356 CUDA_FUNC_QUALIFIER void pme_gpu_init_internal(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 357
 358 /*! \libinternal \brief
 359  * Destroys the PME GPU-framework specific data.
 360  * Should be called last in the PME GPU destructor.
 361  *
 362  * \param[in] pmeGPU  The PME GPU structure.
 363  */
 364 CUDA_FUNC_QUALIFIER void pme_gpu_destroy_specific(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 365
 366 /*! \libinternal \brief
 367  * Initializes the PME GPU synchronization events.
 368  *
 369  * \param[in] pmeGPU  The PME GPU structure.
 370  */
 371 CUDA_FUNC_QUALIFIER void pme_gpu_init_sync_events(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 372
 373 /*! \libinternal \brief
 374  * Destroys the PME GPU synchronization events.
 375  *
 376  * \param[in] pmeGPU  The PME GPU structure.
 377  */
 378 CUDA_FUNC_QUALIFIER void pme_gpu_destroy_sync_events(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 379
 380 /*! \libinternal \brief
 381  * Initializes the CUDA FFT structures.
 382  *
 383  * \param[in] pmeGPU  The PME GPU structure.
 384  */
 385 CUDA_FUNC_QUALIFIER void pme_gpu_reinit_3dfft(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 386
 387 /*! \libinternal \brief
 388  * Destroys the CUDA FFT structures.
 389  *
 390  * \param[in] pmeGPU  The PME GPU structure.
 391  */
 392 CUDA_FUNC_QUALIFIER void pme_gpu_destroy_3dfft(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 393
 394 /* Several CUDA event-based timing functions that live in pme-timings.cu */
 395
 396 /*! \libinternal \brief
 397  * Finalizes all the active PME GPU stage timings for the current computation. Should be called at the end of every computation.
 398  *
 399  * \param[in] pmeGPU         The PME GPU structure.
 400  */
 401 CUDA_FUNC_QUALIFIER void pme_gpu_update_timings(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 402
 403 /*! \libinternal \brief
 404  * Updates the internal list of active PME GPU stages (if timings are enabled).
 405  *
 406  * \param[in] pmeGPU         The PME GPU data structure.
 407  */
 408 CUDA_FUNC_QUALIFIER void pme_gpu_reinit_timings(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 409
 410 /*! \brief
 411  * Resets the PME GPU timings. To be called at the reset MD step.
 412  *
 413  * \param[in] pmeGPU         The PME GPU structure.
 414  */
 415 CUDA_FUNC_QUALIFIER void pme_gpu_reset_timings(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 416
 417 /*! \libinternal \brief
 418  * Copies the PME GPU timings to the gmx_wallclock_gpu_t structure (for log output). To be called at the run end.
 419  *
 420  * \param[in] pmeGPU         The PME GPU structure.
 421  * \param[in] timings        The gmx_wallclock_gpu_pme_t structure.
 422  */
 423 CUDA_FUNC_QUALIFIER void pme_gpu_get_timings(const PmeGpu            *CUDA_FUNC_ARGUMENT(pmeGPU),
 424                                              gmx_wallclock_gpu_pme_t *CUDA_FUNC_ARGUMENT(timings)) CUDA_FUNC_TERM
 425
 426 /* The PME stages themselves */
 427
 428 /*! \libinternal \brief
 429  * A GPU spline computation and charge spreading function.
 430  *
 431  * \param[in]  pmeGpu          The PME GPU structure.
 432  * \param[in]  gridIndex       Index of the PME grid - unused, assumed to be 0.
 433  * \param[out] h_grid          The host-side grid buffer (used only if the result of the spread is expected on the host,
 434  *                             e.g. testing or host-side FFT)
 435  * \param[in]  computeSplines  Should the computation of spline parameters and gridline indices be performed.
 436  * \param[in]  spreadCharges   Should the charges/coefficients be spread on the grid.
 437  */
 438 CUDA_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGpu),
 439                                         int              CUDA_FUNC_ARGUMENT(gridIndex),
 440                                         real            *CUDA_FUNC_ARGUMENT(h_grid),
 441                                         bool             CUDA_FUNC_ARGUMENT(computeSplines),
 442                                         bool             CUDA_FUNC_ARGUMENT(spreadCharges)) CUDA_FUNC_TERM
 443
 444 /*! \libinternal \brief
 445  * 3D FFT R2C/C2R routine.
 446  *
 447  * \param[in]  pmeGpu          The PME GPU structure.
 448  * \param[in]  direction       Transform direction (real-to-complex or complex-to-real)
 449  * \param[in]  gridIndex       Index of the PME grid - unused, assumed to be 0.
 450  */
 451 CUDA_FUNC_QUALIFIER void pme_gpu_3dfft(const PmeGpu          *CUDA_FUNC_ARGUMENT(pmeGpu),
 452                                        enum gmx_fft_direction CUDA_FUNC_ARGUMENT(direction),
 453                                        const int              CUDA_FUNC_ARGUMENT(gridIndex)) CUDA_FUNC_TERM
 454
 455 /*! \libinternal \brief
 456  * A GPU Fourier space solving function.
 457  *
 458  * \param[in]     pmeGpu                  The PME GPU structure.
 459  * \param[in,out] h_grid                  The host-side input and output Fourier grid buffer (used only with testing or host-side FFT)
 460  * \param[in]     gridOrdering            Specifies the dimenion ordering of the complex grid. TODO: store this information?
 461  * \param[in]     computeEnergyAndVirial  Tells if the energy and virial computation should also be performed.
 462  */
 463 CUDA_FUNC_QUALIFIER void pme_gpu_solve(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGpu),
 464                                        t_complex       *CUDA_FUNC_ARGUMENT(h_grid),
 465                                        GridOrdering     CUDA_FUNC_ARGUMENT(gridOrdering),
 466                                        bool             CUDA_FUNC_ARGUMENT(computeEnergyAndVirial)) CUDA_FUNC_TERM
 467
 468 /*! \libinternal \brief
 469  * A GPU force gathering function.
 470  *
 471  * \param[in]     pmeGpu           The PME GPU structure.
 472  * \param[in]     forceTreatment   Tells how data in h_forces should be treated.
 473  *                                 TODO: determine efficiency/balance of host/device-side reductions.
 474  * \param[in]     h_grid           The host-side grid buffer (used only in testing mode)
 475  */
 476 CUDA_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu                *CUDA_FUNC_ARGUMENT(pmeGpu),
 477                                         PmeForceOutputHandling CUDA_FUNC_ARGUMENT(forceTreatment),
 478                                         const float           *CUDA_FUNC_ARGUMENT(h_grid)
 479                                         ) CUDA_FUNC_TERM
 480
 481
 482 /* The inlined convenience PME GPU status getters */
 483
 484 /*! \libinternal \brief
 485  * Tells if PME runs on multiple GPUs with the decomposition.
 486  *
 487  * \param[in] pmeGPU         The PME GPU structure.
 488  * \returns                  True if PME runs on multiple GPUs, false otherwise.
 489  */
 490 gmx_inline bool pme_gpu_uses_dd(const PmeGpu *pmeGPU)
 491 {
 492     return !pmeGPU->settings.useDecomposition;
 493 }
 494
 495 /*! \libinternal \brief
 496  * Tells if PME performs the gathering stage on GPU.
 497  *
 498  * \param[in] pmeGPU         The PME GPU structure.
 499  * \returns                  True if the gathering is performed on GPU, false otherwise.
 500  */
 501 gmx_inline bool pme_gpu_performs_gather(const PmeGpu *pmeGPU)
 502 {
 503     return pmeGPU->settings.performGPUGather;
 504 }
 505
 506 /*! \libinternal \brief
 507  * Tells if PME performs the FFT stages on GPU.
 508  *
 509  * \param[in] pmeGPU         The PME GPU structure.
 510  * \returns                  True if FFT is performed on GPU, false otherwise.
 511  */
 512 gmx_inline bool pme_gpu_performs_FFT(const PmeGpu *pmeGPU)
 513 {
 514     return pmeGPU->settings.performGPUFFT;
 515 }
 516
 517 /*! \libinternal \brief
 518  * Tells if PME performs the grid (un-)wrapping on GPU.
 519  *
 520  * \param[in] pmeGPU         The PME GPU structure.
 521  * \returns                  True if (un-)wrapping is performed on GPU, false otherwise.
 522  */
 523 gmx_inline bool pme_gpu_performs_wrapping(const PmeGpu *pmeGPU)
 524 {
 525     return pmeGPU->settings.useDecomposition;
 526 }
 527
 528 /*! \libinternal \brief
 529  * Tells if PME performs the grid solving on GPU.
 530  *
 531  * \param[in] pmeGPU         The PME GPU structure.
 532  * \returns                  True if solving is performed on GPU, false otherwise.
 533  */
 534 gmx_inline bool pme_gpu_performs_solve(const PmeGpu *pmeGPU)
 535 {
 536     return pmeGPU->settings.performGPUSolve;
 537 }
 538
 539 /*! \libinternal \brief
 540  * Enables or disables the testing mode.
 541  * Testing mode only implies copying all the outputs, even the intermediate ones, to the host,
 542  * and also makes the copies synchronous.
 543  *
 544  * \param[in] pmeGPU             The PME GPU structure.
 545  * \param[in] testing            Should the testing mode be enabled, or disabled.
 546  */
 547 gmx_inline void pme_gpu_set_testing(PmeGpu *pmeGPU, bool testing)
 548 {
 549     pmeGPU->settings.copyAllOutputs = testing;
 550     pmeGPU->settings.transferKind   = testing ? GpuApiCallBehavior::Sync : GpuApiCallBehavior::Async;
 551 }
 552
 553 /*! \libinternal \brief
 554  * Tells if PME is in the testing mode.
 555  *
 556  * \param[in] pmeGPU             The PME GPU structure.
 557  * \returns                      true if testing mode is enabled, false otherwise.
 558  */
 559 gmx_inline bool pme_gpu_is_testing(const PmeGpu *pmeGPU)
 560 {
 561     return pmeGPU->settings.copyAllOutputs;
 562 }
 563
 564 /* A block of C++ functions that live in pme-gpu-internal.cpp */
 565
 566 /*! \libinternal \brief
 567  * Returns the GPU gathering staging forces buffer.
 568  *
 569  * \param[in] pmeGPU             The PME GPU structure.
 570  * \returns                      The input/output forces.
 571  */
 572 gmx::ArrayRef<gmx::RVec> pme_gpu_get_forces(PmeGpu *pmeGPU);
 573
 574 /*! \libinternal \brief
 575  * Returns the output virial and energy of the PME solving.
 576  * Should be called after pme_gpu_finish_computation.
 577  *
 578  * \param[in] pmeGPU             The PME GPU structure.
 579  * \param[out] energy            The output energy.
 580  * \param[out] virial            The output virial matrix.
 581  */
 582 void pme_gpu_get_energy_virial(const PmeGpu *pmeGPU, real *energy, matrix virial);
 583
 584 /*! \libinternal \brief
 585  * Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_prepare_computation().
 586  *
 587  * \param[in] pmeGPU         The PME GPU structure.
 588  * \param[in] box            The unit cell box.
 589  */
 590 void pme_gpu_update_input_box(PmeGpu *pmeGPU, const matrix box);
 591
 592 /*! \libinternal \brief
 593  * Finishes the PME GPU computation, waiting for the output forces and/or energy/virial to be copied to the host.
 594  * If forces were computed, they will have arrived at the external host buffer provided to gather.
 595  * If virial/energy were computed, they will have arrived into the internal staging buffer
 596  * (even though that should have already happened before even launching the gather).
 597  * Finally, cudaEvent_t based GPU timers get updated if enabled. They also need stream synchronization for correctness.
 598  * Additionally, device-side buffers are cleared asynchronously for the next computation.
 599  *
 600  * \param[in] pmeGPU         The PME GPU structure.
 601  */
 602 void pme_gpu_finish_computation(const PmeGpu *pmeGPU);
 603
 604 //! A binary enum for spline data layout transformation
 605 enum class PmeLayoutTransform
 606 {
 607     GpuToHost,
 608     HostToGpu
 609 };
 610
 611 /*! \libinternal \brief
 612  * Rearranges the atom spline data between the GPU and host layouts.
 613  * Only used for test purposes so far, likely to be horribly slow.
 614  *
 615  * \param[in]  pmeGPU     The PME GPU structure.
 616  * \param[out] atc        The PME CPU atom data structure (with a single-threaded layout).
 617  * \param[in]  type       The spline data type (values or derivatives).
 618  * \param[in]  dimIndex   Dimension index.
 619  * \param[in]  transform  Layout transform type
 620  */
 621 void pme_gpu_transform_spline_atom_data(const PmeGpu *pmeGPU, const pme_atomcomm_t *atc,
 622                                         PmeSplineDataType type, int dimIndex, PmeLayoutTransform transform);
 623
 624 /*! \libinternal \brief
 625  * Get the normal/padded grid dimensions of the real-space PME grid on GPU. Only used in tests.
 626  *
 627  * \param[in] pmeGPU             The PME GPU structure.
 628  * \param[out] gridSize          Pointer to the grid dimensions to fill in.
 629  * \param[out] paddedGridSize    Pointer to the padded grid dimensions to fill in.
 630  */
 631 void pme_gpu_get_real_grid_sizes(const PmeGpu *pmeGPU, gmx::IVec *gridSize, gmx::IVec *paddedGridSize);
 632
 633 /*! \libinternal \brief
 634  * (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
 635  *
 636  * \param[in,out] pme       The PME structure.
 637  * \param[in,out] gpuInfo   The GPU information structure.
 638  * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
 639  */
 640 void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo);
 641
 642 /*! \libinternal \brief
 643  * Destroys the PME GPU data at the end of the run.
 644  *
 645  * \param[in] pmeGPU     The PME GPU structure.
 646  */
 647 void pme_gpu_destroy(PmeGpu *pmeGPU);
 648
 649 /*! \libinternal \brief
 650  * Reallocates the local atoms data (charges, coordinates, etc.). Copies the charges to the GPU.
 651  *
 652  * \param[in] pmeGPU    The PME GPU structure.
 653  * \param[in] nAtoms    The number of particles.
 654  * \param[in] charges   The pointer to the host-side array of particle charges.
 655  *
 656  * This is a function that should only be called in the beginning of the run and on domain decomposition.
 657  * Should be called before the pme_gpu_set_io_ranges.
 658  */
 659 void pme_gpu_reinit_atoms(PmeGpu           *pmeGPU,
 660                           const int         nAtoms,
 661                           const real       *charges);
 662
 663 #endif