src/gromacs/ewald/pme-gpu-internal.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2016,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *
  38  * \brief This file contains internal function definitions for performing the PME calculations on GPU.
  39  * These are not meant to be exposed outside of the PME GPU code.
  40  * As of now, their bodies are still in the common pme-gpu.cpp files.
  41  *
  42  * \author Aleksei Iupinov <a.yupinov@gmail.com>
  43  * \ingroup module_ewald
  44  */
  45
  46 #ifndef GMX_EWALD_PME_GPU_INTERNAL_H
  47 #define GMX_EWALD_PME_GPU_INTERNAL_H
  48
  49 #include "gromacs/fft/fft.h"                   // for the gmx_fft_direction enum
  50 #include "gromacs/gpu_utils/gpu_macros.h"      // for the CUDA_FUNC_ macros
  51 #include "gromacs/utility/arrayref.h"
  52
  53 #include "pme-gpu-types.h"                     // for the inline functions accessing PmeGpu members
  54
  55 struct gmx_hw_info_t;
  56 struct gmx_gpu_opt_t;
  57 struct gmx_pme_t;                              // only used in pme_gpu_reinit
  58 struct gmx_wallclock_gpu_pme_t;
  59 struct pme_atomcomm_t;
  60 struct t_complex;
  61
  62 namespace gmx
  63 {
  64 class MDLogger;
  65 }
  66
  67 //! Type of spline data
  68 enum class PmeSplineDataType
  69 {
  70     Values,      // theta
  71     Derivatives, // dtheta
  72 };               //TODO move this into new and shiny pme.h (pme-types.h?)
  73
  74 //! PME grid dimension ordering (from major to minor)
  75 enum class GridOrdering
  76 {
  77     YZX,
  78     XYZ
  79 };
  80
  81 /* Some general constants for PME GPU behaviour follow. */
  82
  83 /*! \brief \libinternal
  84  * false: The atom data GPU buffers are sized precisely according to the number of atoms.
  85  *        (Except GPU spline data layout which is regardless intertwined for 2 atoms per warp).
  86  *        The atom index checks in the spread/gather code potentially hinder the performance.
  87  * true:  The atom data GPU buffers are padded with zeroes so that the possible number of atoms
  88  *        fitting in is divisible by PME_ATOM_DATA_ALIGNMENT.
  89  *        The atom index checks are not performed. There should be a performance win, but how big is it, remains to be seen.
  90  *        Additional cudaMemsetAsync calls are done occasionally (only charges/coordinates; spline data is always recalculated now).
  91  * \todo Estimate performance differences
  92  */
  93 const bool c_usePadding = true;
  94
  95 /*! \brief \libinternal
  96  * false: Atoms with zero charges are processed by PME. Could introduce some overhead.
  97  * true:  Atoms with zero charges are not processed by PME. Adds branching to the spread/gather.
  98  *        Could be good for performance in specific systems with lots of neutral atoms.
  99  * \todo Estimate performance differences.
 100  */
 101 const bool c_skipNeutralAtoms = false;
 102
 103 /*! \brief \libinternal
 104  * Number of PME solve output floating point numbers.
 105  * 6 for symmetric virial matrix + 1 for reciprocal energy.
 106  */
 107 const int c_virialAndEnergyCount = 7;
 108
 109 /* A block of CUDA-only functions that live in pme.cu */
 110
 111 /*! \libinternal \brief
 112  * Returns the number of atoms per chunk in the atom charges/coordinates data layout.
 113  * Depends on CUDA-specific block sizes, needed for the atom data padding.
 114  *
 115  * \param[in] pmeGpu            The PME GPU structure.
 116  * \returns   Number of atoms in a single GPU atom data chunk.
 117  */
 118 CUDA_FUNC_QUALIFIER int pme_gpu_get_atom_data_alignment(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM_WITH_RETURN(1)
 119
 120 /*! \libinternal \brief
 121  * Returns the number of atoms per chunk in the atom spline theta/dtheta data layout.
 122  *
 123  * \param[in] pmeGpu            The PME GPU structure.
 124  * \returns   Number of atoms in a single GPU atom spline data chunk.
 125  */
 126 CUDA_FUNC_QUALIFIER int pme_gpu_get_atoms_per_warp(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM_WITH_RETURN(1)
 127
 128 /*! \libinternal \brief
 129  * Synchronizes the current computation, waiting for the GPU kernels/transfers to finish.
 130  *
 131  * \param[in] pmeGpu            The PME GPU structure.
 132  */
 133 CUDA_FUNC_QUALIFIER void pme_gpu_synchronize(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 134
 135 /*! \libinternal \brief
 136  * Allocates the fixed size energy and virial buffer both on GPU and CPU.
 137  *
 138  * \param[in] pmeGpu            The PME GPU structure.
 139  */
 140 CUDA_FUNC_QUALIFIER void pme_gpu_alloc_energy_virial(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 141
 142 /*! \libinternal \brief
 143  * Frees the energy and virial memory both on GPU and CPU.
 144  *
 145  * \param[in] pmeGpu            The PME GPU structure.
 146  */
 147 CUDA_FUNC_QUALIFIER void pme_gpu_free_energy_virial(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 148
 149 /*! \libinternal \brief
 150  * Clears the energy and virial memory on GPU with 0.
 151  * Should be called at the end of PME computation which returned energy/virial.
 152  *
 153  * \param[in] pmeGpu            The PME GPU structure.
 154  */
 155 CUDA_FUNC_QUALIFIER void pme_gpu_clear_energy_virial(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 156
 157 /*! \libinternal \brief
 158  * Reallocates and copies the pre-computed B-spline values to the GPU.
 159  *
 160  * \param[in] pmeGpu             The PME GPU structure.
 161  */
 162 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_bspline_values(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 163
 164 /*! \libinternal \brief
 165  * Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers).
 166  *
 167  * \param[in] pmeGpu             The PME GPU structure.
 168  */
 169 CUDA_FUNC_QUALIFIER void pme_gpu_free_bspline_values(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 170
 171 /*! \libinternal \brief
 172  * Reallocates the GPU buffer for the PME forces.
 173  *
 174  * \param[in] pmeGpu             The PME GPU structure.
 175  */
 176 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_forces(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 177
 178 /*! \libinternal \brief
 179  * Frees the GPU buffer for the PME forces.
 180  *
 181  * \param[in] pmeGpu             The PME GPU structure.
 182  */
 183 CUDA_FUNC_QUALIFIER void pme_gpu_free_forces(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 184
 185 /*! \libinternal \brief
 186  * Copies the forces from the CPU buffer to the GPU (to reduce them with the PME GPU gathered forces).
 187  * To be called e.g. after the bonded calculations.
 188  *
 189  * \param[in] pmeGpu             The PME GPU structure.
 190  */
 191 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_forces(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 192
 193 /*! \libinternal \brief
 194  * Copies the forces from the GPU to the CPU buffer. To be called after the gathering stage.
 195  *
 196  * \param[in] pmeGpu             The PME GPU structure.
 197  */
 198 CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_forces(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 199
 200 /*! \libinternal \brief
 201  * Checks whether work in the PME GPU stream has completed.
 202  *
 203  * \param[in] pmeGpu            The PME GPU structure.
 204  *
 205  * \returns                     True if work in the PME stream has completed.
 206  */
 207 CUDA_FUNC_QUALIFIER bool pme_gpu_stream_query(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM_WITH_RETURN(0)
 208
 209 /*! \libinternal \brief
 210  * Reallocates the input coordinates buffer on the GPU (and clears the padded part if needed).
 211  *
 212  * \param[in] pmeGpu            The PME GPU structure.
 213  *
 214  * Needs to be called on every DD step/in the beginning.
 215  */
 216 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_coordinates(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 217
 218 /*! \libinternal \brief
 219  * Copies the input coordinates from the CPU buffer onto the GPU.
 220  *
 221  * \param[in] pmeGpu            The PME GPU structure.
 222  * \param[in] h_coordinates     Input coordinates (XYZ rvec array).
 223  *
 224  * Needs to be called for every PME computation. The coordinates are then used in the spline calculation.
 225  */
 226 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_coordinates(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGpu),
 227                                                         const rvec      *CUDA_FUNC_ARGUMENT(h_coordinates)) CUDA_FUNC_TERM
 228
 229 /*! \libinternal \brief
 230  * Frees the coordinates on the GPU.
 231  *
 232  * \param[in] pmeGpu            The PME GPU structure.
 233  */
 234 CUDA_FUNC_QUALIFIER void pme_gpu_free_coordinates(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 235
 236 /*! \libinternal \brief
 237  * Reallocates the buffer on the GPU and copies the charges/coefficients from the CPU buffer.
 238  * Clears the padded part if needed.
 239  *
 240  * \param[in] pmeGpu            The PME GPU structure.
 241  * \param[in] h_coefficients    The input atom charges/coefficients.
 242  *
 243  * Does not need to be done for every PME computation, only whenever the local charges change.
 244  * (So, in the beginning of the run, or on DD step).
 245  */
 246 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGpu),
 247                                                                      const float     *CUDA_FUNC_ARGUMENT(h_coefficients)) CUDA_FUNC_TERM
 248
 249 /*! \libinternal \brief
 250  * Frees the charges/coefficients on the GPU.
 251  *
 252  * \param[in] pmeGpu             The PME GPU structure.
 253  */
 254 CUDA_FUNC_QUALIFIER void pme_gpu_free_coefficients(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 255
 256 /*! \libinternal \brief
 257  * Reallocates the buffers on the GPU and the host for the atoms spline data.
 258  *
 259  * \param[in] pmeGpu            The PME GPU structure.
 260  */
 261 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_spline_data(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 262
 263 /*! \libinternal \brief
 264  * Frees the buffers on the GPU for the atoms spline data.
 265  *
 266  * \param[in] pmeGpu            The PME GPU structure.
 267  */
 268 CUDA_FUNC_QUALIFIER void pme_gpu_free_spline_data(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 269
 270 /*! \libinternal \brief
 271  * Reallocates the buffers on the GPU and the host for the particle gridline indices.
 272  *
 273  * \param[in] pmeGpu            The PME GPU structure.
 274  */
 275 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_grid_indices(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 276
 277 /*! \libinternal \brief
 278  * Frees the buffer on the GPU for the particle gridline indices.
 279  *
 280  * \param[in] pmeGpu            The PME GPU structure.
 281  */
 282 CUDA_FUNC_QUALIFIER void pme_gpu_free_grid_indices(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 283
 284 /*! \libinternal \brief
 285  * Reallocates the real space grid and the complex reciprocal grid (if needed) on the GPU.
 286  *
 287  * \param[in] pmeGpu            The PME GPU structure.
 288  */
 289 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_grids(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 290
 291 /*! \libinternal \brief
 292  * Frees the real space grid and the complex reciprocal grid (if needed) on the GPU.
 293  *
 294  * \param[in] pmeGpu            The PME GPU structure.
 295  */
 296 CUDA_FUNC_QUALIFIER void pme_gpu_free_grids(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 297
 298 /*! \libinternal \brief
 299  * Clears the real space grid on the GPU.
 300  * Should be called at the end of each computation.
 301  *
 302  * \param[in] pmeGpu            The PME GPU structure.
 303  */
 304 CUDA_FUNC_QUALIFIER void pme_gpu_clear_grids(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 305
 306 /*! \libinternal \brief
 307  * Reallocates and copies the pre-computed fractional coordinates' shifts to the GPU.
 308  *
 309  * \param[in] pmeGpu            The PME GPU structure.
 310  */
 311 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 312
 313 /*! \libinternal \brief
 314  * Frees the pre-computed fractional coordinates' shifts on the GPU.
 315  *
 316  * \param[in] pmeGpu            The PME GPU structure.
 317  */
 318 CUDA_FUNC_QUALIFIER void pme_gpu_free_fract_shifts(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 319
 320 /*! \libinternal \brief
 321  * Copies the input real-space grid from the host to the GPU.
 322  *
 323  * \param[in] pmeGpu   The PME GPU structure.
 324  * \param[in] h_grid   The host-side grid buffer.
 325  */
 326 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_gather_grid(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGpu),
 327                                                         float           *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM
 328
 329 /*! \libinternal \brief
 330  * Copies the output real-space grid from the GPU to the host.
 331  *
 332  * \param[in] pmeGpu   The PME GPU structure.
 333  * \param[out] h_grid  The host-side grid buffer.
 334  */
 335 CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_grid(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGpu),
 336                                                          float           *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM
 337
 338 /*! \libinternal \brief
 339  * Copies the spread output spline data and gridline indices from the GPU to the host.
 340  *
 341  * \param[in] pmeGpu   The PME GPU structure.
 342  */
 343 CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_atom_data(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 344
 345 /*! \libinternal \brief
 346  * Copies the gather input spline data and gridline indices from the host to the GPU.
 347  *
 348  * \param[in] pmeGpu   The PME GPU structure.
 349  */
 350 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_gather_atom_data(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 351
 352 /*! \libinternal \brief
 353  * Waits for the grid copying to the host-side buffer after spreading to finish.
 354  *
 355  * \param[in] pmeGpu  The PME GPU structure.
 356  */
 357 CUDA_FUNC_QUALIFIER void pme_gpu_sync_spread_grid(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 358
 359 /*! \libinternal \brief
 360  * Does the one-time GPU-framework specific PME initialization.
 361  * For CUDA, the PME stream is created with the highest priority.
 362  *
 363  * \param[in] pmeGpu  The PME GPU structure.
 364  */
 365 CUDA_FUNC_QUALIFIER void pme_gpu_init_internal(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 366
 367 /*! \libinternal \brief
 368  * Destroys the PME GPU-framework specific data.
 369  * Should be called last in the PME GPU destructor.
 370  *
 371  * \param[in] pmeGpu  The PME GPU structure.
 372  */
 373 CUDA_FUNC_QUALIFIER void pme_gpu_destroy_specific(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 374
 375 /*! \libinternal \brief
 376  * Initializes the PME GPU synchronization events.
 377  *
 378  * \param[in] pmeGpu  The PME GPU structure.
 379  */
 380 CUDA_FUNC_QUALIFIER void pme_gpu_init_sync_events(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 381
 382 /*! \libinternal \brief
 383  * Destroys the PME GPU synchronization events.
 384  *
 385  * \param[in] pmeGpu  The PME GPU structure.
 386  */
 387 CUDA_FUNC_QUALIFIER void pme_gpu_destroy_sync_events(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 388
 389 /*! \libinternal \brief
 390  * Initializes the CUDA FFT structures.
 391  *
 392  * \param[in] pmeGpu  The PME GPU structure.
 393  */
 394 CUDA_FUNC_QUALIFIER void pme_gpu_reinit_3dfft(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 395
 396 /*! \libinternal \brief
 397  * Destroys the CUDA FFT structures.
 398  *
 399  * \param[in] pmeGpu  The PME GPU structure.
 400  */
 401 CUDA_FUNC_QUALIFIER void pme_gpu_destroy_3dfft(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 402
 403 /* Several CUDA event-based timing functions that live in pme-timings.cu */
 404
 405 /*! \libinternal \brief
 406  * Finalizes all the active PME GPU stage timings for the current computation. Should be called at the end of every computation.
 407  *
 408  * \param[in] pmeGpu         The PME GPU structure.
 409  */
 410 CUDA_FUNC_QUALIFIER void pme_gpu_update_timings(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 411
 412 /*! \libinternal \brief
 413  * Updates the internal list of active PME GPU stages (if timings are enabled).
 414  *
 415  * \param[in] pmeGpu         The PME GPU data structure.
 416  */
 417 CUDA_FUNC_QUALIFIER void pme_gpu_reinit_timings(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 418
 419 /*! \brief
 420  * Resets the PME GPU timings. To be called at the reset MD step.
 421  *
 422  * \param[in] pmeGpu         The PME GPU structure.
 423  */
 424 CUDA_FUNC_QUALIFIER void pme_gpu_reset_timings(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 425
 426 /*! \libinternal \brief
 427  * Copies the PME GPU timings to the gmx_wallclock_gpu_t structure (for log output). To be called at the run end.
 428  *
 429  * \param[in] pmeGpu         The PME GPU structure.
 430  * \param[in] timings        The gmx_wallclock_gpu_pme_t structure.
 431  */
 432 CUDA_FUNC_QUALIFIER void pme_gpu_get_timings(const PmeGpu            *CUDA_FUNC_ARGUMENT(pmeGpu),
 433                                              gmx_wallclock_gpu_pme_t *CUDA_FUNC_ARGUMENT(timings)) CUDA_FUNC_TERM
 434
 435 /* The PME stages themselves */
 436
 437 /*! \libinternal \brief
 438  * A GPU spline computation and charge spreading function.
 439  *
 440  * \param[in]  pmeGpu          The PME GPU structure.
 441  * \param[in]  gridIndex       Index of the PME grid - unused, assumed to be 0.
 442  * \param[out] h_grid          The host-side grid buffer (used only if the result of the spread is expected on the host,
 443  *                             e.g. testing or host-side FFT)
 444  * \param[in]  computeSplines  Should the computation of spline parameters and gridline indices be performed.
 445  * \param[in]  spreadCharges   Should the charges/coefficients be spread on the grid.
 446  */
 447 CUDA_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGpu),
 448                                         int              CUDA_FUNC_ARGUMENT(gridIndex),
 449                                         real            *CUDA_FUNC_ARGUMENT(h_grid),
 450                                         bool             CUDA_FUNC_ARGUMENT(computeSplines),
 451                                         bool             CUDA_FUNC_ARGUMENT(spreadCharges)) CUDA_FUNC_TERM
 452
 453 /*! \libinternal \brief
 454  * 3D FFT R2C/C2R routine.
 455  *
 456  * \param[in]  pmeGpu          The PME GPU structure.
 457  * \param[in]  direction       Transform direction (real-to-complex or complex-to-real)
 458  * \param[in]  gridIndex       Index of the PME grid - unused, assumed to be 0.
 459  */
 460 CUDA_FUNC_QUALIFIER void pme_gpu_3dfft(const PmeGpu          *CUDA_FUNC_ARGUMENT(pmeGpu),
 461                                        enum gmx_fft_direction CUDA_FUNC_ARGUMENT(direction),
 462                                        const int              CUDA_FUNC_ARGUMENT(gridIndex)) CUDA_FUNC_TERM
 463
 464 /*! \libinternal \brief
 465  * A GPU Fourier space solving function.
 466  *
 467  * \param[in]     pmeGpu                  The PME GPU structure.
 468  * \param[in,out] h_grid                  The host-side input and output Fourier grid buffer (used only with testing or host-side FFT)
 469  * \param[in]     gridOrdering            Specifies the dimenion ordering of the complex grid. TODO: store this information?
 470  * \param[in]     computeEnergyAndVirial  Tells if the energy and virial computation should also be performed.
 471  */
 472 CUDA_FUNC_QUALIFIER void pme_gpu_solve(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGpu),
 473                                        t_complex       *CUDA_FUNC_ARGUMENT(h_grid),
 474                                        GridOrdering     CUDA_FUNC_ARGUMENT(gridOrdering),
 475                                        bool             CUDA_FUNC_ARGUMENT(computeEnergyAndVirial)) CUDA_FUNC_TERM
 476
 477 /*! \libinternal \brief
 478  * A GPU force gathering function.
 479  *
 480  * \param[in]     pmeGpu           The PME GPU structure.
 481  * \param[in]     forceTreatment   Tells how data in h_forces should be treated.
 482  *                                 TODO: determine efficiency/balance of host/device-side reductions.
 483  * \param[in]     h_grid           The host-side grid buffer (used only in testing mode)
 484  */
 485 CUDA_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu                *CUDA_FUNC_ARGUMENT(pmeGpu),
 486                                         PmeForceOutputHandling CUDA_FUNC_ARGUMENT(forceTreatment),
 487                                         const float           *CUDA_FUNC_ARGUMENT(h_grid)
 488                                         ) CUDA_FUNC_TERM
 489
 490
 491 /* The inlined convenience PME GPU status getters */
 492
 493 /*! \libinternal \brief
 494  * Tells if PME runs on multiple GPUs with the decomposition.
 495  *
 496  * \param[in] pmeGpu         The PME GPU structure.
 497  * \returns                  True if PME runs on multiple GPUs, false otherwise.
 498  */
 499 gmx_inline bool pme_gpu_uses_dd(const PmeGpu *pmeGpu)
 500 {
 501     return !pmeGpu->settings.useDecomposition;
 502 }
 503
 504 /*! \libinternal \brief
 505  * Tells if PME performs the gathering stage on GPU.
 506  *
 507  * \param[in] pmeGpu         The PME GPU structure.
 508  * \returns                  True if the gathering is performed on GPU, false otherwise.
 509  */
 510 gmx_inline bool pme_gpu_performs_gather(const PmeGpu *pmeGpu)
 511 {
 512     return pmeGpu->settings.performGPUGather;
 513 }
 514
 515 /*! \libinternal \brief
 516  * Tells if PME performs the FFT stages on GPU.
 517  *
 518  * \param[in] pmeGpu         The PME GPU structure.
 519  * \returns                  True if FFT is performed on GPU, false otherwise.
 520  */
 521 gmx_inline bool pme_gpu_performs_FFT(const PmeGpu *pmeGpu)
 522 {
 523     return pmeGpu->settings.performGPUFFT;
 524 }
 525
 526 /*! \libinternal \brief
 527  * Tells if PME performs the grid (un-)wrapping on GPU.
 528  *
 529  * \param[in] pmeGpu         The PME GPU structure.
 530  * \returns                  True if (un-)wrapping is performed on GPU, false otherwise.
 531  */
 532 gmx_inline bool pme_gpu_performs_wrapping(const PmeGpu *pmeGpu)
 533 {
 534     return pmeGpu->settings.useDecomposition;
 535 }
 536
 537 /*! \libinternal \brief
 538  * Tells if PME performs the grid solving on GPU.
 539  *
 540  * \param[in] pmeGpu         The PME GPU structure.
 541  * \returns                  True if solving is performed on GPU, false otherwise.
 542  */
 543 gmx_inline bool pme_gpu_performs_solve(const PmeGpu *pmeGpu)
 544 {
 545     return pmeGpu->settings.performGPUSolve;
 546 }
 547
 548 /*! \libinternal \brief
 549  * Enables or disables the testing mode.
 550  * Testing mode only implies copying all the outputs, even the intermediate ones, to the host,
 551  * and also makes the copies synchronous.
 552  *
 553  * \param[in] pmeGpu             The PME GPU structure.
 554  * \param[in] testing            Should the testing mode be enabled, or disabled.
 555  */
 556 gmx_inline void pme_gpu_set_testing(PmeGpu *pmeGpu, bool testing)
 557 {
 558     pmeGpu->settings.copyAllOutputs = testing;
 559     pmeGpu->settings.transferKind   = testing ? GpuApiCallBehavior::Sync : GpuApiCallBehavior::Async;
 560 }
 561
 562 /*! \libinternal \brief
 563  * Tells if PME is in the testing mode.
 564  *
 565  * \param[in] pmeGpu             The PME GPU structure.
 566  * \returns                      true if testing mode is enabled, false otherwise.
 567  */
 568 gmx_inline bool pme_gpu_is_testing(const PmeGpu *pmeGpu)
 569 {
 570     return pmeGpu->settings.copyAllOutputs;
 571 }
 572
 573 /* A block of C++ functions that live in pme-gpu-internal.cpp */
 574
 575 /*! \libinternal \brief
 576  * Returns the GPU gathering staging forces buffer.
 577  *
 578  * \param[in] pmeGpu             The PME GPU structure.
 579  * \returns                      The input/output forces.
 580  */
 581 gmx::ArrayRef<gmx::RVec> pme_gpu_get_forces(PmeGpu *pmeGpu);
 582
 583 /*! \libinternal \brief
 584  * Returns the output virial and energy of the PME solving.
 585  *
 586  * \param[in] pmeGpu             The PME GPU structure.
 587  * \param[out] energy            The output energy.
 588  * \param[out] virial            The output virial matrix.
 589  */
 590 void pme_gpu_get_energy_virial(const PmeGpu *pmeGpu, real *energy, matrix virial);
 591
 592 /*! \libinternal \brief
 593  * Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_prepare_computation().
 594  *
 595  * \param[in] pmeGpu         The PME GPU structure.
 596  * \param[in] box            The unit cell box.
 597  */
 598 void pme_gpu_update_input_box(PmeGpu *pmeGpu, const matrix box);
 599
 600 /*! \libinternal \brief
 601  * Finishes the PME GPU computation, waiting for the output forces and/or energy/virial to be copied to the host.
 602  * If forces were computed, they will have arrived at the external host buffer provided to gather.
 603  * If virial/energy were computed, they will have arrived into the internal staging buffer
 604  * (even though that should have already happened before even launching the gather).
 605  * Finally, cudaEvent_t based GPU timers get updated if enabled. They also need stream synchronization for correctness.
 606  * Additionally, device-side buffers are cleared asynchronously for the next computation.
 607  *
 608  * \param[in] pmeGpu         The PME GPU structure.
 609  */
 610 void pme_gpu_finish_computation(const PmeGpu *pmeGpu);
 611
 612 //! A binary enum for spline data layout transformation
 613 enum class PmeLayoutTransform
 614 {
 615     GpuToHost,
 616     HostToGpu
 617 };
 618
 619 /*! \libinternal \brief
 620  * Rearranges the atom spline data between the GPU and host layouts.
 621  * Only used for test purposes so far, likely to be horribly slow.
 622  *
 623  * \param[in]  pmeGpu     The PME GPU structure.
 624  * \param[out] atc        The PME CPU atom data structure (with a single-threaded layout).
 625  * \param[in]  type       The spline data type (values or derivatives).
 626  * \param[in]  dimIndex   Dimension index.
 627  * \param[in]  transform  Layout transform type
 628  */
 629 void pme_gpu_transform_spline_atom_data(const PmeGpu *pmeGpu, const pme_atomcomm_t *atc,
 630                                         PmeSplineDataType type, int dimIndex, PmeLayoutTransform transform);
 631
 632 /*! \libinternal \brief
 633  * Get the normal/padded grid dimensions of the real-space PME grid on GPU. Only used in tests.
 634  *
 635  * \param[in] pmeGpu             The PME GPU structure.
 636  * \param[out] gridSize          Pointer to the grid dimensions to fill in.
 637  * \param[out] paddedGridSize    Pointer to the padded grid dimensions to fill in.
 638  */
 639 void pme_gpu_get_real_grid_sizes(const PmeGpu *pmeGpu, gmx::IVec *gridSize, gmx::IVec *paddedGridSize);
 640
 641 /*! \libinternal \brief
 642  * (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
 643  *
 644  * \param[in,out] pme       The PME structure.
 645  * \param[in,out] gpuInfo   The GPU information structure.
 646  * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
 647  */
 648 void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo);
 649
 650 /*! \libinternal \brief
 651  * Destroys the PME GPU data at the end of the run.
 652  *
 653  * \param[in] pmeGpu     The PME GPU structure.
 654  */
 655 void pme_gpu_destroy(PmeGpu *pmeGpu);
 656
 657 /*! \libinternal \brief
 658  * Reallocates the local atoms data (charges, coordinates, etc.). Copies the charges to the GPU.
 659  *
 660  * \param[in] pmeGpu    The PME GPU structure.
 661  * \param[in] nAtoms    The number of particles.
 662  * \param[in] charges   The pointer to the host-side array of particle charges.
 663  *
 664  * This is a function that should only be called in the beginning of the run and on domain decomposition.
 665  * Should be called before the pme_gpu_set_io_ranges.
 666  */
 667 void pme_gpu_reinit_atoms(PmeGpu           *pmeGpu,
 668                           const int         nAtoms,
 669                           const real       *charges);
 670
 671 /*! \brief \libinternal
 672  * The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
 673  *
 674  * This clears the device-side working buffers in preparation for new computation.
 675  *
 676  * \param[in] pmeGpu            The PME GPU structure.
 677  */
 678 void pme_gpu_reinit_computation(const PmeGpu *pmeGpu);
 679
 680
 681 #endif