src/gromacs/ewald/pme_gpu_internal.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *
  38  * \brief This file contains internal function definitions for performing the PME calculations on GPU.
  39  * These are not meant to be exposed outside of the PME GPU code.
  40  * As of now, their bodies are still in the common pme_gpu.cpp files.
  41  *
  42  * \author Aleksei Iupinov <a.yupinov@gmail.com>
  43  * \ingroup module_ewald
  44  */
  45
  46 #ifndef GMX_EWALD_PME_GPU_INTERNAL_H
  47 #define GMX_EWALD_PME_GPU_INTERNAL_H
  48
  49 #include "gromacs/fft/fft.h" // for the gmx_fft_direction enum
  50 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
  51 #include "gromacs/gpu_utils/gpu_macros.h" // for the GPU_FUNC_ macros
  52 #include "gromacs/utility/arrayref.h"
  53
  54 #include "pme_gpu_types_host.h"
  55 #include "pme_output.h"
  56
  57 class GpuEventSynchronizer;
  58 struct DeviceInformation;
  59 struct gmx_hw_info_t;
  60 struct gmx_gpu_opt_t;
  61 struct gmx_pme_t; // only used in pme_gpu_reinit
  62 struct gmx_wallcycle;
  63 class PmeAtomComm;
  64 enum class PmeForceOutputHandling;
  65 struct PmeGpu;
  66 class PmeGpuProgram;
  67 struct PmeGpuStaging;
  68 struct PmeGpuSettings;
  69 struct t_complex;
  70
  71 namespace gmx
  72 {
  73 class MDLogger;
  74 }
  75
  76 //! Type of spline data
  77 enum class PmeSplineDataType
  78 {
  79     Values,      // theta
  80     Derivatives, // dtheta
  81 };               // TODO move this into new and shiny pme.h (pme-types.h?)
  82
  83 //! PME grid dimension ordering (from major to minor)
  84 enum class GridOrdering
  85 {
  86     YZX,
  87     XYZ
  88 };
  89
  90 /*! \libinternal \brief
  91  * Returns the number of atoms per chunk in the atom charges/coordinates data layout.
  92  * Depends on CUDA-specific block sizes, needed for the atom data padding.
  93  *
  94  * \param[in] pmeGpu            The PME GPU structure.
  95  * \returns   Number of atoms in a single GPU atom data chunk.
  96  */
  97 int pme_gpu_get_atom_data_alignment(const PmeGpu* pmeGpu);
  98
  99 /*! \libinternal \brief
 100  * Returns the number of atoms per chunk in the atom spline theta/dtheta data layout.
 101  *
 102  * \param[in] pmeGpu            The PME GPU structure.
 103  * \returns   Number of atoms in a single GPU atom spline data chunk.
 104  */
 105 int pme_gpu_get_atoms_per_warp(const PmeGpu* pmeGpu);
 106
 107 /*! \libinternal \brief
 108  * Synchronizes the current computation, waiting for the GPU kernels/transfers to finish.
 109  *
 110  * \param[in] pmeGpu            The PME GPU structure.
 111  */
 112 GPU_FUNC_QUALIFIER void pme_gpu_synchronize(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM;
 113
 114 /*! \libinternal \brief
 115  * Allocates the fixed size energy and virial buffer both on GPU and CPU.
 116  *
 117  * \param[in,out] pmeGpu            The PME GPU structure.
 118  */
 119 void pme_gpu_alloc_energy_virial(PmeGpu* pmeGpu);
 120
 121 /*! \libinternal \brief
 122  * Frees the energy and virial memory both on GPU and CPU.
 123  *
 124  * \param[in] pmeGpu            The PME GPU structure.
 125  */
 126 void pme_gpu_free_energy_virial(PmeGpu* pmeGpu);
 127
 128 /*! \libinternal \brief
 129  * Clears the energy and virial memory on GPU with 0.
 130  * Should be called at the end of PME computation which returned energy/virial.
 131  *
 132  * \param[in] pmeGpu            The PME GPU structure.
 133  */
 134 void pme_gpu_clear_energy_virial(const PmeGpu* pmeGpu);
 135
 136 /*! \libinternal \brief
 137  * Reallocates and copies the pre-computed B-spline values to the GPU.
 138  *
 139  * \param[in,out] pmeGpu             The PME GPU structure.
 140  */
 141 void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu);
 142
 143 /*! \libinternal \brief
 144  * Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers).
 145  *
 146  * \param[in] pmeGpu             The PME GPU structure.
 147  */
 148 void pme_gpu_free_bspline_values(const PmeGpu* pmeGpu);
 149
 150 /*! \libinternal \brief
 151  * Reallocates the GPU buffer for the PME forces.
 152  *
 153  * \param[in] pmeGpu             The PME GPU structure.
 154  */
 155 void pme_gpu_realloc_forces(PmeGpu* pmeGpu);
 156
 157 /*! \libinternal \brief
 158  * Frees the GPU buffer for the PME forces.
 159  *
 160  * \param[in] pmeGpu             The PME GPU structure.
 161  */
 162 void pme_gpu_free_forces(const PmeGpu* pmeGpu);
 163
 164 /*! \libinternal \brief
 165  * Copies the forces from the CPU buffer to the GPU (to reduce them with the PME GPU gathered
 166  * forces). To be called e.g. after the bonded calculations.
 167  *
 168  * \param[in] pmeGpu             The PME GPU structure.
 169  */
 170 void pme_gpu_copy_input_forces(PmeGpu* pmeGpu);
 171
 172 /*! \libinternal \brief
 173  * Copies the forces from the GPU to the CPU buffer. To be called after the gathering stage.
 174  *
 175  * \param[in] pmeGpu             The PME GPU structure.
 176  */
 177 void pme_gpu_copy_output_forces(PmeGpu* pmeGpu);
 178
 179 /*! \libinternal \brief
 180  * Checks whether work in the PME GPU stream has completed.
 181  *
 182  * \param[in] pmeGpu            The PME GPU structure.
 183  *
 184  * \returns                     True if work in the PME stream has completed.
 185  */
 186 bool pme_gpu_stream_query(const PmeGpu* pmeGpu);
 187
 188 /*! \libinternal \brief
 189  * Reallocates the buffer on the GPU and copies the charges/coefficients from the CPU buffer.
 190  * Clears the padded part if needed.
 191  *
 192  * \param[in] pmeGpu            The PME GPU structure.
 193  * \param[in] h_coefficients    The input atom charges/coefficients.
 194  *
 195  * Does not need to be done for every PME computation, only whenever the local charges change.
 196  * (So, in the beginning of the run, or on DD step).
 197  */
 198 void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_coefficients);
 199
 200 /*! \libinternal \brief
 201  * Frees the charges/coefficients on the GPU.
 202  *
 203  * \param[in] pmeGpu             The PME GPU structure.
 204  */
 205 void pme_gpu_free_coefficients(const PmeGpu* pmeGpu);
 206
 207 /*! \libinternal \brief
 208  * Reallocates the buffers on the GPU and the host for the atoms spline data.
 209  *
 210  * \param[in,out] pmeGpu            The PME GPU structure.
 211  */
 212 void pme_gpu_realloc_spline_data(PmeGpu* pmeGpu);
 213
 214 /*! \libinternal \brief
 215  * Frees the buffers on the GPU for the atoms spline data.
 216  *
 217  * \param[in] pmeGpu            The PME GPU structure.
 218  */
 219 void pme_gpu_free_spline_data(const PmeGpu* pmeGpu);
 220
 221 /*! \libinternal \brief
 222  * Reallocates the buffers on the GPU and the host for the particle gridline indices.
 223  *
 224  * \param[in,out] pmeGpu            The PME GPU structure.
 225  */
 226 void pme_gpu_realloc_grid_indices(PmeGpu* pmeGpu);
 227
 228 /*! \libinternal \brief
 229  * Frees the buffer on the GPU for the particle gridline indices.
 230  *
 231  * \param[in] pmeGpu            The PME GPU structure.
 232  */
 233 void pme_gpu_free_grid_indices(const PmeGpu* pmeGpu);
 234
 235 /*! \libinternal \brief
 236  * Reallocates the real space grid and the complex reciprocal grid (if needed) on the GPU.
 237  *
 238  * \param[in] pmeGpu            The PME GPU structure.
 239  */
 240 void pme_gpu_realloc_grids(PmeGpu* pmeGpu);
 241
 242 /*! \libinternal \brief
 243  * Frees the real space grid and the complex reciprocal grid (if needed) on the GPU.
 244  *
 245  * \param[in] pmeGpu            The PME GPU structure.
 246  */
 247 void pme_gpu_free_grids(const PmeGpu* pmeGpu);
 248
 249 /*! \libinternal \brief
 250  * Clears the real space grid on the GPU.
 251  * Should be called at the end of each computation.
 252  *
 253  * \param[in] pmeGpu            The PME GPU structure.
 254  */
 255 void pme_gpu_clear_grids(const PmeGpu* pmeGpu);
 256
 257 /*! \libinternal \brief
 258  * Reallocates and copies the pre-computed fractional coordinates' shifts to the GPU.
 259  *
 260  * \param[in] pmeGpu            The PME GPU structure.
 261  */
 262 void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu* pmeGpu);
 263
 264 /*! \libinternal \brief
 265  * Frees the pre-computed fractional coordinates' shifts on the GPU.
 266  *
 267  * \param[in] pmeGpu            The PME GPU structure.
 268  */
 269 void pme_gpu_free_fract_shifts(const PmeGpu* pmeGpu);
 270
 271 /*! \libinternal \brief
 272  * Copies the input real-space grid from the host to the GPU.
 273  *
 274  * \param[in] pmeGpu   The PME GPU structure.
 275  * \param[in] h_grid   The host-side grid buffer.
 276  */
 277 void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, float* h_grid);
 278
 279 /*! \libinternal \brief
 280  * Copies the output real-space grid from the GPU to the host.
 281  *
 282  * \param[in] pmeGpu   The PME GPU structure.
 283  * \param[out] h_grid  The host-side grid buffer.
 284  */
 285 void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid);
 286
 287 /*! \libinternal \brief
 288  * Copies the spread output spline data and gridline indices from the GPU to the host.
 289  *
 290  * \param[in] pmeGpu   The PME GPU structure.
 291  */
 292 void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu);
 293
 294 /*! \libinternal \brief
 295  * Copies the gather input spline data and gridline indices from the host to the GPU.
 296  *
 297  * \param[in] pmeGpu   The PME GPU structure.
 298  */
 299 void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu);
 300
 301 /*! \libinternal \brief
 302  * Waits for the grid copying to the host-side buffer after spreading to finish.
 303  *
 304  * \param[in] pmeGpu  The PME GPU structure.
 305  */
 306 void pme_gpu_sync_spread_grid(const PmeGpu* pmeGpu);
 307
 308 /*! \libinternal \brief
 309  * Does the one-time GPU-framework specific PME initialization.
 310  * For CUDA, the PME stream is created with the highest priority.
 311  *
 312  * \param[in] pmeGpu  The PME GPU structure.
 313  */
 314 void pme_gpu_init_internal(PmeGpu* pmeGpu);
 315
 316 /*! \libinternal \brief
 317  * Destroys the PME GPU-framework specific data.
 318  * Should be called last in the PME GPU destructor.
 319  *
 320  * \param[in] pmeGpu  The PME GPU structure.
 321  */
 322 void pme_gpu_destroy_specific(const PmeGpu* pmeGpu);
 323
 324 /*! \libinternal \brief
 325  * Initializes the CUDA FFT structures.
 326  *
 327  * \param[in] pmeGpu  The PME GPU structure.
 328  */
 329 void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu);
 330
 331 /*! \libinternal \brief
 332  * Destroys the CUDA FFT structures.
 333  *
 334  * \param[in] pmeGpu  The PME GPU structure.
 335  */
 336 void pme_gpu_destroy_3dfft(const PmeGpu* pmeGpu);
 337
 338 /* The PME stages themselves */
 339
 340 /*! \libinternal \brief
 341  * A GPU spline computation and charge spreading function.
 342  *
 343  * \param[in]  pmeGpu          The PME GPU structure.
 344  * \param[in]  xReadyOnDevice  Event synchronizer indicating that the coordinates are ready in the device memory;
 345  *                             can be nullptr when invoked on a separate PME rank or from PME tests.
 346  * \param[in]  gridIndex       Index of the PME grid - unused, assumed to be 0.
 347  * \param[out] h_grid          The host-side grid buffer (used only if the result of the spread is expected on the host,
 348  *                             e.g. testing or host-side FFT)
 349  * \param[in]  computeSplines  Should the computation of spline parameters and gridline indices be performed.
 350  * \param[in]  spreadCharges   Should the charges/coefficients be spread on the grid.
 351  */
 352 GPU_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu*         GPU_FUNC_ARGUMENT(pmeGpu),
 353                                        GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice),
 354                                        int                   GPU_FUNC_ARGUMENT(gridIndex),
 355                                        real*                 GPU_FUNC_ARGUMENT(h_grid),
 356                                        bool                  GPU_FUNC_ARGUMENT(computeSplines),
 357                                        bool GPU_FUNC_ARGUMENT(spreadCharges)) GPU_FUNC_TERM;
 358
 359 /*! \libinternal \brief
 360  * 3D FFT R2C/C2R routine.
 361  *
 362  * \param[in]  pmeGpu          The PME GPU structure.
 363  * \param[in]  direction       Transform direction (real-to-complex or complex-to-real)
 364  * \param[in]  gridIndex       Index of the PME grid - unused, assumed to be 0.
 365  */
 366 void pme_gpu_3dfft(const PmeGpu* pmeGpu, enum gmx_fft_direction direction, int gridIndex);
 367
 368 /*! \libinternal \brief
 369  * A GPU Fourier space solving function.
 370  *
 371  * \param[in]     pmeGpu                  The PME GPU structure.
 372  * \param[in,out] h_grid                  The host-side input and output Fourier grid buffer (used only with testing or host-side FFT)
 373  * \param[in]     gridOrdering            Specifies the dimenion ordering of the complex grid. TODO: store this information?
 374  * \param[in]     computeEnergyAndVirial  Tells if the energy and virial computation should also be performed.
 375  */
 376 GPU_FUNC_QUALIFIER void pme_gpu_solve(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
 377                                       t_complex*    GPU_FUNC_ARGUMENT(h_grid),
 378                                       GridOrdering  GPU_FUNC_ARGUMENT(gridOrdering),
 379                                       bool GPU_FUNC_ARGUMENT(computeEnergyAndVirial)) GPU_FUNC_TERM;
 380
 381 /*! \libinternal \brief
 382  * A GPU force gathering function.
 383  *
 384  * \param[in]     pmeGpu           The PME GPU structure.
 385  * \param[in]     forceTreatment   Tells how data in h_forces should be treated.
 386  *                                 TODO: determine efficiency/balance of host/device-side
 387  * reductions. \param[in]     h_grid           The host-side grid buffer (used only in testing mode)
 388  */
 389 GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu*                GPU_FUNC_ARGUMENT(pmeGpu),
 390                                        PmeForceOutputHandling GPU_FUNC_ARGUMENT(forceTreatment),
 391                                        const float* GPU_FUNC_ARGUMENT(h_grid)) GPU_FUNC_TERM;
 392
 393 /*! \brief Sets the device pointer to coordinate data
 394  * \param[in] pmeGpu         The PME GPU structure.
 395  * \param[in] d_x            Pointer to coordinate data
 396  */
 397 GPU_FUNC_QUALIFIER void pme_gpu_set_kernelparam_coordinates(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
 398                                                             DeviceBuffer<float> GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
 399
 400 /*! \brief Return pointer to device copy of force data.
 401  * \param[in] pmeGpu         The PME GPU structure.
 402  * \returns                  Pointer to force data
 403  */
 404 GPU_FUNC_QUALIFIER void* pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
 405         GPU_FUNC_TERM_WITH_RETURN(nullptr);
 406
 407 /*! \brief Return pointer to GPU stream.
 408  * \param[in] pmeGpu         The PME GPU structure.
 409  * \returns                  Pointer to stream object.
 410  */
 411 GPU_FUNC_QUALIFIER void* pme_gpu_get_stream(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
 412         GPU_FUNC_TERM_WITH_RETURN(nullptr);
 413
 414 /*! \brief Return pointer to GPU context (for OpenCL builds).
 415  * \param[in] pmeGpu         The PME GPU structure.
 416  * \returns                  Pointer to context object.
 417  */
 418 GPU_FUNC_QUALIFIER void* pme_gpu_get_context(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
 419         GPU_FUNC_TERM_WITH_RETURN(nullptr);
 420
 421 /*! \brief Return pointer to the sync object triggered after the PME force calculation completion
 422  * \param[in] pmeGpu         The PME GPU structure.
 423  * \returns                  Pointer to sync object
 424  */
 425 GPU_FUNC_QUALIFIER GpuEventSynchronizer* pme_gpu_get_forces_ready_synchronizer(
 426         const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
 427
 428 /*! \libinternal \brief
 429  * Returns the PME GPU settings
 430  *
 431  * \param[in] pmeGpu         The PME GPU structure.
 432  * \returns                  The settings for PME on GPU
 433  */
 434 inline const PmeGpuSettings& pme_gpu_settings(const PmeGpu* pmeGpu)
 435 {
 436     return pmeGpu->settings;
 437 }
 438
 439 /*! \libinternal \brief
 440  * Returns the PME GPU staging object
 441  *
 442  * \param[in] pmeGpu         The PME GPU structure.
 443  * \returns                  The staging object for PME on GPU
 444  */
 445 inline const PmeGpuStaging& pme_gpu_staging(const PmeGpu* pmeGpu)
 446 {
 447     return pmeGpu->staging;
 448 }
 449
 450 /*! \libinternal \brief
 451  * Sets whether the PME module is running in testing mode
 452  *
 453  * \param[in] pmeGpu         The PME GPU structure.
 454  * \param[in] testing        Whether testing mode is on.
 455  */
 456 inline void pme_gpu_set_testing(PmeGpu* pmeGpu, bool testing)
 457 {
 458     if (pmeGpu)
 459     {
 460         pmeGpu->settings.copyAllOutputs = testing;
 461         pmeGpu->settings.transferKind = testing ? GpuApiCallBehavior::Sync : GpuApiCallBehavior::Async;
 462     }
 463 }
 464
 465 /* A block of C++ functions that live in pme_gpu_internal.cpp */
 466
 467 /*! \libinternal \brief
 468  * Returns the energy and virial GPU outputs, useful for testing.
 469  *
 470  * It is the caller's responsibility to be aware of whether the GPU
 471  * handled the solve stage.
 472  *
 473  * \param[in] pme                The PME structure.
 474  * \param[out] output            Pointer to output where energy and virial should be stored.
 475  */
 476 GPU_FUNC_QUALIFIER void pme_gpu_getEnergyAndVirial(const gmx_pme_t& GPU_FUNC_ARGUMENT(pme),
 477                                                    PmeOutput* GPU_FUNC_ARGUMENT(output)) GPU_FUNC_TERM;
 478
 479 /*! \libinternal \brief
 480  * Returns the GPU outputs (forces, energy and virial)
 481  *
 482  * \param[in] pme                The PME structure.
 483  * \param[in] flags              The combination of flags that affected this PME computation.
 484  *                               The flags are the GMX_PME_ flags from pme.h.
 485  * \returns                      The output object.
 486  */
 487 GPU_FUNC_QUALIFIER PmeOutput pme_gpu_getOutput(const gmx_pme_t& GPU_FUNC_ARGUMENT(pme),
 488                                                int              GPU_FUNC_ARGUMENT(flags))
 489         GPU_FUNC_TERM_WITH_RETURN(PmeOutput{});
 490
 491 /*! \libinternal \brief
 492  * Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_prepare_computation().
 493  *
 494  * \param[in] pmeGpu         The PME GPU structure.
 495  * \param[in] box            The unit cell box.
 496  */
 497 GPU_FUNC_QUALIFIER void pme_gpu_update_input_box(PmeGpu*      GPU_FUNC_ARGUMENT(pmeGpu),
 498                                                  const matrix GPU_FUNC_ARGUMENT(box)) GPU_FUNC_TERM;
 499
 500 /*! \libinternal \brief
 501  * Finishes the PME GPU computation, waiting for the output forces and/or energy/virial to be copied to the host.
 502  * If forces were computed, they will have arrived at the external host buffer provided to gather.
 503  * If virial/energy were computed, they will have arrived into the internal staging buffer
 504  * (even though that should have already happened before even launching the gather).
 505  * Finally, cudaEvent_t based GPU timers get updated if enabled. They also need stream synchronization for correctness.
 506  * Additionally, device-side buffers are cleared asynchronously for the next computation.
 507  *
 508  * \param[in] pmeGpu         The PME GPU structure.
 509  */
 510 void pme_gpu_finish_computation(const PmeGpu* pmeGpu);
 511
 512 //! A binary enum for spline data layout transformation
 513 enum class PmeLayoutTransform
 514 {
 515     GpuToHost,
 516     HostToGpu
 517 };
 518
 519 /*! \libinternal \brief
 520  * Rearranges the atom spline data between the GPU and host layouts.
 521  * Only used for test purposes so far, likely to be horribly slow.
 522  *
 523  * \param[in]  pmeGpu     The PME GPU structure.
 524  * \param[out] atc        The PME CPU atom data structure (with a single-threaded layout).
 525  * \param[in]  type       The spline data type (values or derivatives).
 526  * \param[in]  dimIndex   Dimension index.
 527  * \param[in]  transform  Layout transform type
 528  */
 529 GPU_FUNC_QUALIFIER void pme_gpu_transform_spline_atom_data(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
 530                                                            const PmeAtomComm* GPU_FUNC_ARGUMENT(atc),
 531                                                            PmeSplineDataType GPU_FUNC_ARGUMENT(type),
 532                                                            int GPU_FUNC_ARGUMENT(dimIndex),
 533                                                            PmeLayoutTransform GPU_FUNC_ARGUMENT(transform)) GPU_FUNC_TERM;
 534
 535 /*! \libinternal \brief
 536  * Gets a unique index to an element in a spline parameter buffer (theta/dtheta),
 537  * which is laid out for GPU spread/gather kernels. The index is wrt the execution block,
 538  * in range(0, atomsPerBlock * order * DIM).
 539  * This is a wrapper, only used in unit tests.
 540  * \param[in] order            PME order
 541  * \param[in] splineIndex      Spline contribution index (from 0 to \p order - 1)
 542  * \param[in] dimIndex         Dimension index (from 0 to 2)
 543  * \param[in] atomIndex        Atom index wrt the block.
 544  * \param[in] atomsPerWarp     Number of atoms processed by a warp.
 545  *
 546  * \returns Index into theta or dtheta array using GPU layout.
 547  */
 548 int getSplineParamFullIndex(int order, int splineIndex, int dimIndex, int atomIndex, int atomsPerWarp);
 549
 550 /*! \libinternal \brief
 551  * Get the normal/padded grid dimensions of the real-space PME grid on GPU. Only used in tests.
 552  *
 553  * \param[in] pmeGpu             The PME GPU structure.
 554  * \param[out] gridSize          Pointer to the grid dimensions to fill in.
 555  * \param[out] paddedGridSize    Pointer to the padded grid dimensions to fill in.
 556  */
 557 GPU_FUNC_QUALIFIER void pme_gpu_get_real_grid_sizes(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
 558                                                     gmx::IVec*    GPU_FUNC_ARGUMENT(gridSize),
 559                                                     gmx::IVec* GPU_FUNC_ARGUMENT(paddedGridSize)) GPU_FUNC_TERM;
 560
 561 /*! \libinternal \brief
 562  * (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
 563  *
 564  * \param[in,out] pme             The PME structure.
 565  * \param[in]     deviceInfo      The GPU device information structure.
 566  * \param[in]     pmeGpuProgram   The PME GPU program data
 567  * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
 568  */
 569 GPU_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t*               GPU_FUNC_ARGUMENT(pme),
 570                                        const DeviceInformation* GPU_FUNC_ARGUMENT(deviceInfo),
 571                                        const PmeGpuProgram* GPU_FUNC_ARGUMENT(pmeGpuProgram)) GPU_FUNC_TERM;
 572
 573 /*! \libinternal \brief
 574  * Destroys the PME GPU data at the end of the run.
 575  *
 576  * \param[in] pmeGpu     The PME GPU structure.
 577  */
 578 GPU_FUNC_QUALIFIER void pme_gpu_destroy(PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM;
 579
 580 /*! \libinternal \brief
 581  * Reallocates the local atoms data (charges, coordinates, etc.). Copies the charges to the GPU.
 582  *
 583  * \param[in] pmeGpu    The PME GPU structure.
 584  * \param[in] nAtoms    The number of particles.
 585  * \param[in] charges   The pointer to the host-side array of particle charges.
 586  *
 587  * This is a function that should only be called in the beginning of the run and on domain
 588  * decomposition. Should be called before the pme_gpu_set_io_ranges.
 589  */
 590 GPU_FUNC_QUALIFIER void pme_gpu_reinit_atoms(PmeGpu*     GPU_FUNC_ARGUMENT(pmeGpu),
 591                                              int         GPU_FUNC_ARGUMENT(nAtoms),
 592                                              const real* GPU_FUNC_ARGUMENT(charges)) GPU_FUNC_TERM;
 593
 594 /*! \brief \libinternal
 595  * The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
 596  *
 597  * This clears the device-side working buffers in preparation for new computation.
 598  *
 599  * \param[in] pmeGpu            The PME GPU structure.
 600  */
 601 void pme_gpu_reinit_computation(const PmeGpu* pmeGpu);
 602
 603 /*! \brief
 604  * Blocks until PME GPU tasks are completed, and gets the output forces and virial/energy
 605  * (if they were to be computed).
 606  *
 607  * \param[in]  pme            The PME data structure.
 608  * \param[in]  flags          The combination of flags to affect this PME computation.
 609  *                            The flags are the GMX_PME_ flags from pme.h.
 610  * \param[out] wcycle         The wallclock counter.
 611  * \return     The output forces, energy and virial
 612  */
 613 GPU_FUNC_QUALIFIER PmeOutput pme_gpu_wait_finish_task(gmx_pme_t*     GPU_FUNC_ARGUMENT(pme),
 614                                                       int            GPU_FUNC_ARGUMENT(flags),
 615                                                       gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle))
 616         GPU_FUNC_TERM_WITH_RETURN(PmeOutput{});
 617
 618 #endif