src/gromacs/ewald/pme-gpu-internal.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2016,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *
  38  * \brief This file contains internal function definitions for performing the PME calculations on GPU.
  39  * These are not meant to be exposed outside of the PME GPU code.
  40  * As of now, their bodies are still in the common pme-gpu.cpp files.
  41  *
  42  * \author Aleksei Iupinov <a.yupinov@gmail.com>
  43  * \ingroup module_ewald
  44  */
  45
  46 #ifndef GMX_EWALD_PME_GPU_INTERNAL_H
  47 #define GMX_EWALD_PME_GPU_INTERNAL_H
  48
  49 #include "gromacs/fft/fft.h"                   // for the gmx_fft_direction enum
  50 #include "gromacs/gpu_utils/gpu_macros.h"      // for the CUDA_FUNC_ macros
  51
  52 #include "pme-gpu-types.h"                     // for the inline functions accessing PmeGpu members
  53
  54 struct gmx_hw_info_t;
  55 struct gmx_gpu_opt_t;
  56 struct gmx_pme_t;                              // only used in pme_gpu_reinit
  57 struct t_commrec;
  58 struct gmx_wallclock_gpu_pme_t;
  59 struct pme_atomcomm_t;
  60 struct t_complex;
  61
  62 namespace gmx
  63 {
  64 class MDLogger;
  65 }
  66
  67 //! Type of spline data
  68 enum class PmeSplineDataType
  69 {
  70     Values,      // theta
  71     Derivatives, // dtheta
  72 };               //TODO move this into new and shiny pme.h (pme-types.h?)
  73
  74 //! PME grid dimension ordering (from major to minor)
  75 enum class GridOrdering
  76 {
  77     YZX,
  78     XYZ
  79 };
  80
  81 /* Some general constants for PME GPU behaviour follow. */
  82
  83 /*! \brief \libinternal
  84  * false: The atom data GPU buffers are sized precisely according to the number of atoms.
  85  *        (Except GPU spline data layout which is regardless intertwined for 2 atoms per warp).
  86  *        The atom index checks in the spread/gather code potentially hinder the performance.
  87  * true:  The atom data GPU buffers are padded with zeroes so that the possible number of atoms
  88  *        fitting in is divisible by PME_ATOM_DATA_ALIGNMENT.
  89  *        The atom index checks are not performed. There should be a performance win, but how big is it, remains to be seen.
  90  *        Additional cudaMemsetAsync calls are done occasionally (only charges/coordinates; spline data is always recalculated now).
  91  * \todo Estimate performance differences
  92  */
  93 const bool c_usePadding = true;
  94
  95 /*! \brief \libinternal
  96  * false: Atoms with zero charges are processed by PME. Could introduce some overhead.
  97  * true:  Atoms with zero charges are not processed by PME. Adds branching to the spread/gather.
  98  *        Could be good for performance in specific systems with lots of neutral atoms.
  99  * \todo Estimate performance differences.
 100  */
 101 const bool c_skipNeutralAtoms = false;
 102
 103 /*! \brief \libinternal
 104  * Number of PME solve output floating point numbers.
 105  * 6 for symmetric virial matrix + 1 for reciprocal energy.
 106  */
 107 const int c_virialAndEnergyCount = 7;
 108
 109 /* A block of CUDA-only functions that live in pme.cu */
 110
 111 /*! \libinternal \brief
 112  * Returns the number of atoms per chunk in the atom charges/coordinates data layout.
 113  * Depends on CUDA-specific block sizes, needed for the atom data padding.
 114  *
 115  * \param[in] pmeGPU            The PME GPU structure.
 116  * \returns   Number of atoms in a single GPU atom data chunk.
 117  */
 118 CUDA_FUNC_QUALIFIER int pme_gpu_get_atom_data_alignment(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM_WITH_RETURN(1)
 119
 120 /*! \libinternal \brief
 121  * Returns the number of atoms per chunk in the atom spline theta/dtheta data layout.
 122  *
 123  * \param[in] pmeGPU            The PME GPU structure.
 124  * \returns   Number of atoms in a single GPU atom spline data chunk.
 125  */
 126 CUDA_FUNC_QUALIFIER int pme_gpu_get_atoms_per_warp(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM_WITH_RETURN(1)
 127
 128 /*! \libinternal \brief
 129  * Synchronizes the current computation, waiting for the GPU kernels/transfers to finish.
 130  *
 131  * \param[in] pmeGPU            The PME GPU structure.
 132  */
 133 CUDA_FUNC_QUALIFIER void pme_gpu_synchronize(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 134
 135 /*! \libinternal \brief
 136  * Allocates the fixed size energy and virial buffer both on GPU and CPU.
 137  *
 138  * \param[in] pmeGPU            The PME GPU structure.
 139  */
 140 CUDA_FUNC_QUALIFIER void pme_gpu_alloc_energy_virial(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 141
 142 /*! \libinternal \brief
 143  * Frees the energy and virial memory both on GPU and CPU.
 144  *
 145  * \param[in] pmeGPU            The PME GPU structure.
 146  */
 147 CUDA_FUNC_QUALIFIER void pme_gpu_free_energy_virial(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 148
 149 /*! \libinternal \brief
 150  * Clears the energy and virial memory on GPU with 0.
 151  * Should be called at the end of PME computation which returned energy/virial.
 152  *
 153  * \param[in] pmeGPU            The PME GPU structure.
 154  */
 155 CUDA_FUNC_QUALIFIER void pme_gpu_clear_energy_virial(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 156
 157 /*! \libinternal \brief
 158  * Reallocates and copies the pre-computed B-spline values to the GPU.
 159  *
 160  * \param[in] pmeGPU             The PME GPU structure.
 161  */
 162 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_bspline_values(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 163
 164 /*! \libinternal \brief
 165  * Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers).
 166  *
 167  * \param[in] pmeGPU             The PME GPU structure.
 168  */
 169 CUDA_FUNC_QUALIFIER void pme_gpu_free_bspline_values(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 170
 171 /*! \libinternal \brief
 172  * Reallocates the GPU buffer for the PME forces.
 173  *
 174  * \param[in] pmeGPU             The PME GPU structure.
 175  */
 176 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_forces(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 177
 178 /*! \libinternal \brief
 179  * Frees the GPU buffer for the PME forces.
 180  *
 181  * \param[in] pmeGPU             The PME GPU structure.
 182  */
 183 CUDA_FUNC_QUALIFIER void pme_gpu_free_forces(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 184
 185 /*! \libinternal \brief
 186  * Copies the forces from the CPU buffer to the GPU (to reduce them with the PME GPU gathered forces).
 187  * To be called e.g. after the bonded calculations.
 188  *
 189  * \param[in] pmeGPU             The PME GPU structure.
 190  * \param[in] h_forces           The input forces rvec buffer.
 191  */
 192 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_forces(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGPU),
 193                                                    const float     *CUDA_FUNC_ARGUMENT(h_forces)) CUDA_FUNC_TERM
 194
 195 /*! \libinternal \brief
 196  * Copies the forces from the GPU to the CPU buffer. To be called after the gathering stage.
 197  *
 198  * \param[in] pmeGPU             The PME GPU structure.
 199  * \param[out] h_forces          The output forces rvec buffer.
 200  */
 201 CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_forces(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGPU),
 202                                                     float           *CUDA_FUNC_ARGUMENT(h_forces)) CUDA_FUNC_TERM
 203
 204 /*! \libinternal \brief
 205  * Reallocates the input coordinates buffer on the GPU (and clears the padded part if needed).
 206  *
 207  * \param[in] pmeGPU            The PME GPU structure.
 208  *
 209  * Needs to be called on every DD step/in the beginning.
 210  */
 211 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_coordinates(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 212
 213 /*! \libinternal \brief
 214  * Copies the input coordinates from the CPU buffer onto the GPU.
 215  *
 216  * \param[in] pmeGPU            The PME GPU structure.
 217  * \param[in] h_coordinates     Input coordinates (XYZ rvec array).
 218  *
 219  * Needs to be called for every PME computation. The coordinates are then used in the spline calculation.
 220  */
 221 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_coordinates(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGPU),
 222                                                         const rvec      *CUDA_FUNC_ARGUMENT(h_coordinates)) CUDA_FUNC_TERM
 223
 224 /*! \libinternal \brief
 225  * Frees the coordinates on the GPU.
 226  *
 227  * \param[in] pmeGPU            The PME GPU structure.
 228  */
 229 CUDA_FUNC_QUALIFIER void pme_gpu_free_coordinates(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 230
 231 /*! \libinternal \brief
 232  * Reallocates the buffer on the GPU and copies the charges/coefficients from the CPU buffer.
 233  * Clears the padded part if needed.
 234  *
 235  * \param[in] pmeGPU            The PME GPU structure.
 236  * \param[in] h_coefficients    The input atom charges/coefficients.
 237  *
 238  * Does not need to be done for every PME computation, only whenever the local charges change.
 239  * (So, in the beginning of the run, or on DD step).
 240  */
 241 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGPU),
 242                                                                      const float     *CUDA_FUNC_ARGUMENT(h_coefficients)) CUDA_FUNC_TERM
 243
 244 /*! \libinternal \brief
 245  * Frees the charges/coefficients on the GPU.
 246  *
 247  * \param[in] pmeGPU             The PME GPU structure.
 248  */
 249 CUDA_FUNC_QUALIFIER void pme_gpu_free_coefficients(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 250
 251 /*! \libinternal \brief
 252  * Reallocates the buffers on the GPU and the host for the atoms spline data.
 253  *
 254  * \param[in] pmeGPU            The PME GPU structure.
 255  */
 256 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_spline_data(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 257
 258 /*! \libinternal \brief
 259  * Frees the buffers on the GPU for the atoms spline data.
 260  *
 261  * \param[in] pmeGPU            The PME GPU structure.
 262  */
 263 CUDA_FUNC_QUALIFIER void pme_gpu_free_spline_data(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 264
 265 /*! \libinternal \brief
 266  * Reallocates the buffers on the GPU and the host for the particle gridline indices.
 267  *
 268  * \param[in] pmeGPU            The PME GPU structure.
 269  */
 270 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_grid_indices(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 271
 272 /*! \libinternal \brief
 273  * Frees the buffer on the GPU for the particle gridline indices.
 274  *
 275  * \param[in] pmeGPU            The PME GPU structure.
 276  */
 277 CUDA_FUNC_QUALIFIER void pme_gpu_free_grid_indices(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 278
 279 /*! \libinternal \brief
 280  * Reallocates the real space grid and the complex reciprocal grid (if needed) on the GPU.
 281  *
 282  * \param[in] pmeGPU            The PME GPU structure.
 283  */
 284 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_grids(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 285
 286 /*! \libinternal \brief
 287  * Frees the real space grid and the complex reciprocal grid (if needed) on the GPU.
 288  *
 289  * \param[in] pmeGPU            The PME GPU structure.
 290  */
 291 CUDA_FUNC_QUALIFIER void pme_gpu_free_grids(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 292
 293 /*! \libinternal \brief
 294  * Clears the real space grid on the GPU.
 295  * Should be called at the end of each computation.
 296  *
 297  * \param[in] pmeGPU            The PME GPU structure.
 298  */
 299 CUDA_FUNC_QUALIFIER void pme_gpu_clear_grids(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 300
 301 /*! \libinternal \brief
 302  * Reallocates and copies the pre-computed fractional coordinates' shifts to the GPU.
 303  *
 304  * \param[in] pmeGPU            The PME GPU structure.
 305  */
 306 CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 307
 308 /*! \libinternal \brief
 309  * Frees the pre-computed fractional coordinates' shifts on the GPU.
 310  *
 311  * \param[in] pmeGPU            The PME GPU structure.
 312  */
 313 CUDA_FUNC_QUALIFIER void pme_gpu_free_fract_shifts(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 314
 315 /*! \libinternal \brief
 316  * Copies the input real-space grid from the host to the GPU.
 317  *
 318  * \param[in] pmeGPU   The PME GPU structure.
 319  * \param[in] h_grid   The host-side grid buffer.
 320  */
 321 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_gather_grid(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGPU),
 322                                                         float           *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM
 323
 324 /*! \libinternal \brief
 325  * Copies the output real-space grid from the GPU to the host.
 326  *
 327  * \param[in] pmeGPU   The PME GPU structure.
 328  * \param[out] h_grid  The host-side grid buffer.
 329  */
 330 CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_grid(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGPU),
 331                                                          float           *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM
 332
 333 /*! \libinternal \brief
 334  * Copies the spread output spline data and gridline indices from the GPU to the host.
 335  *
 336  * \param[in] pmeGPU   The PME GPU structure.
 337  */
 338 CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_atom_data(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 339
 340 /*! \libinternal \brief
 341  * Copies the gather input spline data and gridline indices from the host to the GPU.
 342  *
 343  * \param[in] pmeGPU   The PME GPU structure.
 344  */
 345 CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_gather_atom_data(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 346
 347 /*! \libinternal \brief
 348  * Waits for the grid copying to the host-side buffer after spreading to finish.
 349  *
 350  * \param[in] pmeGPU  The PME GPU structure.
 351  */
 352 CUDA_FUNC_QUALIFIER void pme_gpu_sync_spread_grid(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 353
 354 /*! \libinternal \brief
 355  * Does the one-time GPU-framework specific PME initialization.
 356  * For CUDA, the PME stream is created with the highest priority.
 357  *
 358  * \param[in] pmeGPU  The PME GPU structure.
 359  */
 360 CUDA_FUNC_QUALIFIER void pme_gpu_init_internal(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 361
 362 /*! \libinternal \brief
 363  * Destroys the PME GPU-framework specific data.
 364  * Should be called last in the PME GPU destructor.
 365  *
 366  * \param[in] pmeGPU  The PME GPU structure.
 367  */
 368 CUDA_FUNC_QUALIFIER void pme_gpu_destroy_specific(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 369
 370 /*! \libinternal \brief
 371  * Initializes the PME GPU synchronization events.
 372  *
 373  * \param[in] pmeGPU  The PME GPU structure.
 374  */
 375 CUDA_FUNC_QUALIFIER void pme_gpu_init_sync_events(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 376
 377 /*! \libinternal \brief
 378  * Destroys the PME GPU synchronization events.
 379  *
 380  * \param[in] pmeGPU  The PME GPU structure.
 381  */
 382 CUDA_FUNC_QUALIFIER void pme_gpu_destroy_sync_events(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 383
 384 /*! \libinternal \brief
 385  * Initializes the CUDA FFT structures.
 386  *
 387  * \param[in] pmeGPU  The PME GPU structure.
 388  */
 389 CUDA_FUNC_QUALIFIER void pme_gpu_reinit_3dfft(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 390
 391 /*! \libinternal \brief
 392  * Destroys the CUDA FFT structures.
 393  *
 394  * \param[in] pmeGPU  The PME GPU structure.
 395  */
 396 CUDA_FUNC_QUALIFIER void pme_gpu_destroy_3dfft(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 397
 398 /* Several CUDA event-based timing functions that live in pme-timings.cu */
 399
 400 /*! \libinternal \brief
 401  * Finalizes all the active PME GPU stage timings for the current computation. Should be called at the end of every computation.
 402  *
 403  * \param[in] pmeGPU         The PME GPU structure.
 404  */
 405 CUDA_FUNC_QUALIFIER void pme_gpu_update_timings(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 406
 407 /*! \libinternal \brief
 408  * Updates the internal list of active PME GPU stages (if timings are enabled).
 409  *
 410  * \param[in] pmeGPU         The PME GPU data structure.
 411  */
 412 CUDA_FUNC_QUALIFIER void pme_gpu_reinit_timings(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 413
 414 /*! \brief
 415  * Resets the PME GPU timings. To be called at the reset MD step.
 416  *
 417  * \param[in] pmeGPU         The PME GPU structure.
 418  */
 419 CUDA_FUNC_QUALIFIER void pme_gpu_reset_timings(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 420
 421 /*! \libinternal \brief
 422  * Copies the PME GPU timings to the gmx_wallclock_gpu_t structure (for log output). To be called at the run end.
 423  *
 424  * \param[in] pmeGPU         The PME GPU structure.
 425  * \param[in] timings        The gmx_wallclock_gpu_pme_t structure.
 426  */
 427 CUDA_FUNC_QUALIFIER void pme_gpu_get_timings(const PmeGpu            *CUDA_FUNC_ARGUMENT(pmeGPU),
 428                                              gmx_wallclock_gpu_pme_t *CUDA_FUNC_ARGUMENT(timings)) CUDA_FUNC_TERM
 429
 430 /* The PME stages themselves */
 431
 432 /*! \libinternal \brief
 433  * A GPU spline computation and charge spreading function.
 434  *
 435  * \param[in]  pmeGpu          The PME GPU structure.
 436  * \param[in]  gridIndex       Index of the PME grid - unused, assumed to be 0.
 437  * \param[out] h_grid          The host-side grid buffer (used only if the result of the spread is expected on the host,
 438  *                             e.g. testing or host-side FFT)
 439  * \param[in]  computeSplines  Should the computation of spline parameters and gridline indices be performed.
 440  * \param[in]  spreadCharges   Should the charges/coefficients be spread on the grid.
 441  */
 442 CUDA_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGpu),
 443                                         int              CUDA_FUNC_ARGUMENT(gridIndex),
 444                                         real            *CUDA_FUNC_ARGUMENT(h_grid),
 445                                         bool             CUDA_FUNC_ARGUMENT(computeSplines),
 446                                         bool             CUDA_FUNC_ARGUMENT(spreadCharges)) CUDA_FUNC_TERM
 447
 448 /*! \libinternal \brief
 449  * 3D FFT R2C/C2R routine.
 450  *
 451  * \param[in]  pmeGpu          The PME GPU structure.
 452  * \param[in]  direction       Transform direction (real-to-complex or complex-to-real)
 453  * \param[in]  gridIndex       Index of the PME grid - unused, assumed to be 0.
 454  */
 455 CUDA_FUNC_QUALIFIER void pme_gpu_3dfft(const PmeGpu          *CUDA_FUNC_ARGUMENT(pmeGpu),
 456                                        enum gmx_fft_direction CUDA_FUNC_ARGUMENT(direction),
 457                                        const int              CUDA_FUNC_ARGUMENT(gridIndex)) CUDA_FUNC_TERM
 458
 459 /*! \libinternal \brief
 460  * A GPU Fourier space solving function.
 461  *
 462  * \param[in]     pmeGpu                  The PME GPU structure.
 463  * \param[in,out] h_grid                  The host-side input and output Fourier grid buffer (used only with testing or host-side FFT)
 464  * \param[in]     gridOrdering            Specifies the dimenion ordering of the complex grid. TODO: store this information?
 465  * \param[in]     computeEnergyAndVirial  Tells if the energy and virial computation should also be performed.
 466  */
 467 CUDA_FUNC_QUALIFIER void pme_gpu_solve(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGpu),
 468                                        t_complex       *CUDA_FUNC_ARGUMENT(h_grid),
 469                                        GridOrdering     CUDA_FUNC_ARGUMENT(gridOrdering),
 470                                        bool             CUDA_FUNC_ARGUMENT(computeEnergyAndVirial)) CUDA_FUNC_TERM
 471
 472 /*! \libinternal \brief
 473  * A GPU force gathering function.
 474  *
 475  * \param[in]     pmeGpu           The PME GPU structure.
 476  * \param[in,out] h_forces         The host buffer with input and output forces.
 477  * \param[in]     forceTreatment   Tells how data in h_forces should be treated.
 478  *                                 TODO: determine efficiency/balance of host/device-side reductions.
 479  * \param[in]     h_grid           The host-side grid buffer (used only in testing mode)
 480  */
 481 CUDA_FUNC_QUALIFIER void pme_gpu_gather(const PmeGpu          *CUDA_FUNC_ARGUMENT(pmeGpu),
 482                                         float                 *CUDA_FUNC_ARGUMENT(h_forces),
 483                                         PmeForceOutputHandling CUDA_FUNC_ARGUMENT(forceTreatment),
 484                                         const float           *CUDA_FUNC_ARGUMENT(h_grid)
 485                                         ) CUDA_FUNC_TERM
 486
 487
 488 /* The inlined convenience PME GPU status getters */
 489
 490 /*! \libinternal \brief
 491  * Tells if PME runs on multiple GPUs with the decomposition.
 492  *
 493  * \param[in] pmeGPU         The PME GPU structure.
 494  * \returns                  True if PME runs on multiple GPUs, false otherwise.
 495  */
 496 gmx_inline bool pme_gpu_uses_dd(const PmeGpu *pmeGPU)
 497 {
 498     return !pmeGPU->settings.useDecomposition;
 499 }
 500
 501 /*! \libinternal \brief
 502  * Tells if PME performs the gathering stage on GPU.
 503  *
 504  * \param[in] pmeGPU         The PME GPU structure.
 505  * \returns                  True if the gathering is performed on GPU, false otherwise.
 506  */
 507 gmx_inline bool pme_gpu_performs_gather(const PmeGpu *pmeGPU)
 508 {
 509     return pmeGPU->settings.performGPUGather;
 510 }
 511
 512 /*! \libinternal \brief
 513  * Tells if PME performs the FFT stages on GPU.
 514  *
 515  * \param[in] pmeGPU         The PME GPU structure.
 516  * \returns                  True if FFT is performed on GPU, false otherwise.
 517  */
 518 gmx_inline bool pme_gpu_performs_FFT(const PmeGpu *pmeGPU)
 519 {
 520     return pmeGPU->settings.performGPUFFT;
 521 }
 522
 523 /*! \libinternal \brief
 524  * Tells if PME performs the grid (un-)wrapping on GPU.
 525  *
 526  * \param[in] pmeGPU         The PME GPU structure.
 527  * \returns                  True if (un-)wrapping is performed on GPU, false otherwise.
 528  */
 529 gmx_inline bool pme_gpu_performs_wrapping(const PmeGpu *pmeGPU)
 530 {
 531     return pmeGPU->settings.useDecomposition;
 532 }
 533
 534 /*! \libinternal \brief
 535  * Tells if PME performs the grid solving on GPU.
 536  *
 537  * \param[in] pmeGPU         The PME GPU structure.
 538  * \returns                  True if solving is performed on GPU, false otherwise.
 539  */
 540 gmx_inline bool pme_gpu_performs_solve(const PmeGpu *pmeGPU)
 541 {
 542     return pmeGPU->settings.performGPUSolve;
 543 }
 544
 545 /*! \libinternal \brief
 546  * Enables or disables the testing mode.
 547  * Testing mode only implies copying all the outputs, even the intermediate ones, to the host,
 548  * and also makes the copies synchronous.
 549  *
 550  * \param[in] pmeGPU             The PME GPU structure.
 551  * \param[in] testing            Should the testing mode be enabled, or disabled.
 552  */
 553 gmx_inline void pme_gpu_set_testing(PmeGpu *pmeGPU, bool testing)
 554 {
 555     pmeGPU->settings.copyAllOutputs = testing;
 556     pmeGPU->settings.transferKind   = testing ? GpuApiCallBehavior::Sync : GpuApiCallBehavior::Async;
 557 }
 558
 559 /*! \libinternal \brief
 560  * Tells if PME is in the testing mode.
 561  *
 562  * \param[in] pmeGPU             The PME GPU structure.
 563  * \returns                      true if testing mode is enabled, false otherwise.
 564  */
 565 gmx_inline bool pme_gpu_is_testing(const PmeGpu *pmeGPU)
 566 {
 567     return pmeGPU->settings.copyAllOutputs;
 568 }
 569
 570 /* A block of C++ functions that live in pme-gpu-internal.cpp */
 571
 572 /*! \libinternal \brief
 573  * Returns the output virial and energy of the PME solving.
 574  * Should be called after pme_gpu_finish_computation.
 575  *
 576  * \param[in] pmeGPU             The PME GPU structure.
 577  * \param[out] energy            The output energy.
 578  * \param[out] virial            The output virial matrix.
 579  */
 580 void pme_gpu_get_energy_virial(const PmeGpu *pmeGPU, real *energy, matrix virial);
 581
 582 /*! \libinternal \brief
 583  * Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_prepare_computation().
 584  *
 585  * \param[in] pmeGPU         The PME GPU structure.
 586  * \param[in] box            The unit cell box.
 587  */
 588 void pme_gpu_update_input_box(PmeGpu *pmeGPU, const matrix box);
 589
 590 /*! \libinternal \brief
 591  * Finishes the PME GPU computation, waiting for the output forces and/or energy/virial to be copied to the host.
 592  * If forces were computed, they will have arrived at the external host buffer provided to gather.
 593  * If virial/energy were computed, they will have arrived into the internal staging buffer
 594  * (even though that should have already happened before even launching the gather).
 595  * Finally, cudaEvent_t based GPU timers get updated if enabled. They also need stream synchronization for correctness.
 596  * Additionally, device-side buffers are cleared asynchronously for the next computation.
 597  *
 598  * \param[in] pmeGPU         The PME GPU structure.
 599  */
 600 void pme_gpu_finish_computation(const PmeGpu *pmeGPU);
 601
 602 //! A binary enum for spline data layout transformation
 603 enum class PmeLayoutTransform
 604 {
 605     GpuToHost,
 606     HostToGpu
 607 };
 608
 609 /*! \libinternal \brief
 610  * Rearranges the atom spline data between the GPU and host layouts.
 611  * Only used for test purposes so far, likely to be horribly slow.
 612  *
 613  * \param[in]  pmeGPU     The PME GPU structure.
 614  * \param[out] atc        The PME CPU atom data structure (with a single-threaded layout).
 615  * \param[in]  type       The spline data type (values or derivatives).
 616  * \param[in]  dimIndex   Dimension index.
 617  * \param[in]  transform  Layout transform type
 618  */
 619 void pme_gpu_transform_spline_atom_data(const PmeGpu *pmeGPU, const pme_atomcomm_t *atc,
 620                                         PmeSplineDataType type, int dimIndex, PmeLayoutTransform transform);
 621
 622 /*! \libinternal \brief
 623  * Get the normal/padded grid dimensions of the real-space PME grid on GPU. Only used in tests.
 624  *
 625  * \param[in] pmeGPU             The PME GPU structure.
 626  * \param[out] gridSize          Pointer to the grid dimensions to fill in.
 627  * \param[out] paddedGridSize    Pointer to the padded grid dimensions to fill in.
 628  */
 629 void pme_gpu_get_real_grid_sizes(const PmeGpu *pmeGPU, gmx::IVec *gridSize, gmx::IVec *paddedGridSize);
 630
 631 /*! \libinternal \brief
 632  * (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
 633  *
 634  * \param[in,out] pme       The PME structure.
 635  * \param[in,out] gpuInfo   The GPU information structure.
 636  * \param[in]     mdlog     The logger.
 637  * \param[in]     cr        The communication structure.
 638  * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
 639  */
 640 void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, const t_commrec *cr);
 641
 642 /*! \libinternal \brief
 643  * Destroys the PME GPU data at the end of the run.
 644  *
 645  * \param[in] pmeGPU     The PME GPU structure.
 646  */
 647 void pme_gpu_destroy(PmeGpu *pmeGPU);
 648
 649 /*! \libinternal \brief
 650  * Reallocates the local atoms data (charges, coordinates, etc.). Copies the charges to the GPU.
 651  *
 652  * \param[in] pmeGPU    The PME GPU structure.
 653  * \param[in] nAtoms    The number of particles.
 654  * \param[in] charges   The pointer to the host-side array of particle charges.
 655  *
 656  * This is a function that should only be called in the beginning of the run and on domain decomposition.
 657  * Should be called before the pme_gpu_set_io_ranges.
 658  */
 659 void pme_gpu_reinit_atoms(PmeGpu           *pmeGPU,
 660                           const int         nAtoms,
 661                           const real       *charges);
 662
 663 #endif