src/gromacs/ewald/pme_gpu_internal.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *
  38  * \brief This file contains internal function definitions for performing the PME calculations on GPU.
  39  * These are not meant to be exposed outside of the PME GPU code.
  40  * As of now, their bodies are still in the common pme_gpu.cpp files.
  41  *
  42  * \author Aleksei Iupinov <a.yupinov@gmail.com>
  43  * \ingroup module_ewald
  44  */
  45
  46 #ifndef GMX_EWALD_PME_GPU_INTERNAL_H
  47 #define GMX_EWALD_PME_GPU_INTERNAL_H
  48
  49 #include "gromacs/fft/fft.h" // for the gmx_fft_direction enum
  50 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
  51 #include "gromacs/gpu_utils/gpu_macros.h" // for the GPU_FUNC_ macros
  52
  53 #include "pme_gpu_types_host.h"
  54 #include "pme_output.h"
  55
  56 class DeviceContext;
  57 struct DeviceInformation;
  58 class DeviceStream;
  59 class GpuEventSynchronizer;
  60 struct gmx_hw_info_t;
  61 struct gmx_gpu_opt_t;
  62 struct gmx_pme_t; // only used in pme_gpu_reinit
  63 struct gmx_wallcycle;
  64 class PmeAtomComm;
  65 enum class PmeForceOutputHandling;
  66 struct PmeGpu;
  67 class PmeGpuProgram;
  68 struct PmeGpuStaging;
  69 struct PmeGpuSettings;
  70 struct t_complex;
  71
  72 #ifndef FEP_STATE_A
  73 //! Grid index of FEP state A (or unperturbed system)
  74 #    define FEP_STATE_A 0
  75 #endif
  76 #ifndef FEP_STATE_B
  77 //! Grid index of FEP state B
  78 #    define FEP_STATE_B 1
  79 #endif
  80
  81 namespace gmx // forward declarations only; no definitions are needed by this header
  82 {
  83 template<typename> class ArrayRef;
  84 class MDLogger;
  85 class PmeCoordinateReceiverGpu; // taken by pointer in pme_gpu_spread()
  86 } // namespace gmx
  87
  88 //! Type of spline data: the spline values or their derivatives
  89 enum class PmeSplineDataType
  90 {
  91     Values,      //!< Spline values (theta)
  92     Derivatives, //!< Spline derivatives (dtheta)
  93 };               // TODO move this into new and shiny pme.h (pme-types.h?)
  94
  95 //! PME grid dimension ordering (from major to minor)
  96 enum class GridOrdering
  97 {
  98     YZX, //!< Y is the major dimension, X is the minor one
  99     XYZ  //!< X is the major dimension, Z is the minor one
 100 };
 101
 102 /*! \libinternal \brief
 103  * Returns the atom data block size required by the GPU version of PME.
 104  *
 105  * The GPU version of PME requires that the coordinates array have a
 106  * size divisible by the returned number.
 107  *
 108  * \returns Number of atoms in a single GPU atom data chunk, which
 109  * determines a minimum divisor of the size of the memory allocated.
 110  */
 111 int pme_gpu_get_atom_data_block_size();
 112
 113 /*! \libinternal \brief
 114  * Synchronizes the current computation, waiting for the GPU kernels/transfers to finish.
 115  *
 116  * \param[in] pmeGpu            The PME GPU structure.
 117  */
 118 GPU_FUNC_QUALIFIER void pme_gpu_synchronize(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM;
 119
 120 /*! \libinternal \brief
 121  * Allocates the fixed size energy and virial buffer both on GPU and CPU.
 122  *
 123  * \param[in,out] pmeGpu            The PME GPU structure.
 124  */
 125 void pme_gpu_alloc_energy_virial(PmeGpu* pmeGpu);
 126
 127 /*! \libinternal \brief
 128  * Frees the energy and virial memory both on GPU and CPU.
 129  *
 130  * \param[in] pmeGpu            The PME GPU structure.
 131  */
 132 void pme_gpu_free_energy_virial(PmeGpu* pmeGpu);
 133
 134 /*! \libinternal \brief
 135  * Clears the energy and virial memory on GPU with 0.
 136  * Should be called at the end of PME computation which returned energy/virial.
 137  *
 138  * \param[in] pmeGpu            The PME GPU structure.
 139  */
 140 void pme_gpu_clear_energy_virial(const PmeGpu* pmeGpu);
 141
 142 /*! \libinternal \brief
 143  * Reallocates and copies the pre-computed B-spline values to the GPU.
 144  *
 145  * \param[in,out] pmeGpu             The PME GPU structure.
 146  * \param[in]     gridIndex          The index of the grid to use. 0 is Coulomb in the normal
 147  *                                   state or FEP state A and 1 is Coulomb in FEP state B.
 148  */
 149 void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu, int gridIndex = 0);
 150
 151 /*! \libinternal \brief
 152  * Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers).
 153  *
 154  * \param[in] pmeGpu             The PME GPU structure.
 155  */
 156 void pme_gpu_free_bspline_values(const PmeGpu* pmeGpu);
 157
 158 /*! \libinternal \brief
 159  * Reallocates the GPU buffer for the PME forces.
 160  *
 161  * \param[in] pmeGpu             The PME GPU structure.
 162  */
 163 void pme_gpu_realloc_forces(PmeGpu* pmeGpu);
 164
 165 /*! \libinternal \brief
 166  * Frees the GPU buffer for the PME forces.
 167  *
 168  * \param[in] pmeGpu             The PME GPU structure.
 169  */
 170 void pme_gpu_free_forces(const PmeGpu* pmeGpu);
 171
 172 /*! \libinternal \brief
 173  * Copies the forces from the CPU buffer to the GPU (to reduce them with the PME GPU gathered
 174  * forces). To be called e.g. after the bonded calculations.
 175  *
 176  * \param[in] pmeGpu             The PME GPU structure.
 177  */
 178 void pme_gpu_copy_input_forces(PmeGpu* pmeGpu);
 179
 180 /*! \libinternal \brief
 181  * Copies the forces from the GPU to the CPU buffer. To be called after the gathering stage.
 182  *
 183  * \param[in] pmeGpu             The PME GPU structure.
 184  */
 185 void pme_gpu_copy_output_forces(PmeGpu* pmeGpu);
 186
 187 /*! \libinternal \brief
 188  * Checks whether work in the PME GPU stream has completed.
 189  *
 190  * \param[in] pmeGpu            The PME GPU structure.
 191  *
 192  * \returns                     True if work in the PME stream has completed.
 193  */
 194 bool pme_gpu_stream_query(const PmeGpu* pmeGpu);
 195
 196 /*! \libinternal \brief
 197  * Reallocates the buffer on the GPU and copies the charges/coefficients from the CPU buffer.
 198  * Clears the padded part if needed.
 199  *
 200  * \param[in] pmeGpu            The PME GPU structure.
 201  * \param[in] h_coefficients    The input atom charges/coefficients.
 202  * \param[in] gridIndex         The index of the grid to use. 0 is Coulomb in the normal
 203  *                              state or FEP state A and 1 is Coulomb in FEP state B.
 204  *
 205  * Does not need to be done for every PME computation, only whenever the local charges change.
 206  * (So, in the beginning of the run, or on DD step).
 207  */
 208 void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu* pmeGpu,
 209                                                  const float*  h_coefficients,
 210                                                  int           gridIndex = 0);
 211
 212 /*! \libinternal \brief
 213  * Frees the charges/coefficients on the GPU.
 214  *
 215  * \param[in] pmeGpu             The PME GPU structure.
 216  */
 217 void pme_gpu_free_coefficients(const PmeGpu* pmeGpu);
 218
 219 /*! \libinternal \brief
 220  * Reallocates the buffers on the GPU and the host for the atoms spline data.
 221  *
 222  * \param[in,out] pmeGpu            The PME GPU structure.
 223  */
 224 void pme_gpu_realloc_spline_data(PmeGpu* pmeGpu);
 225
 226 /*! \libinternal \brief
 227  * Frees the buffers on the GPU for the atoms spline data.
 228  *
 229  * \param[in] pmeGpu            The PME GPU structure.
 230  */
 231 void pme_gpu_free_spline_data(const PmeGpu* pmeGpu);
 232
 233 /*! \libinternal \brief
 234  * Reallocates the buffers on the GPU and the host for the particle gridline indices.
 235  *
 236  * \param[in,out] pmeGpu            The PME GPU structure.
 237  */
 238 void pme_gpu_realloc_grid_indices(PmeGpu* pmeGpu);
 239
 240 /*! \libinternal \brief
 241  * Frees the buffer on the GPU for the particle gridline indices.
 242  *
 243  * \param[in] pmeGpu            The PME GPU structure.
 244  */
 245 void pme_gpu_free_grid_indices(const PmeGpu* pmeGpu);
 246
 247 /*! \libinternal \brief
 248  * Reallocates the real space grid and the complex reciprocal grid (if needed) on the GPU.
 249  *
 250  * \param[in] pmeGpu            The PME GPU structure.
 251  */
 252 void pme_gpu_realloc_grids(PmeGpu* pmeGpu);
 253
 254 /*! \libinternal \brief
 255  * Frees the real space grid and the complex reciprocal grid (if needed) on the GPU.
 256  *
 257  * \param[in] pmeGpu            The PME GPU structure.
 258  */
 259 void pme_gpu_free_grids(const PmeGpu* pmeGpu);
 260
 261 /*! \libinternal \brief
 262  * Clears the real space grid on the GPU.
 263  * Should be called at the end of each computation.
 264  *
 265  * \param[in] pmeGpu            The PME GPU structure.
 266  */
 267 void pme_gpu_clear_grids(const PmeGpu* pmeGpu);
 268
 269 /*! \libinternal \brief
 270  * Reallocates and copies the pre-computed fractional coordinates' shifts to the GPU.
 271  *
 272  * \param[in] pmeGpu            The PME GPU structure.
 273  */
 274 void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu* pmeGpu);
 275
 276 /*! \libinternal \brief
 277  * Frees the pre-computed fractional coordinates' shifts on the GPU.
 278  *
 279  * \param[in] pmeGpu            The PME GPU structure.
 280  */
 281 void pme_gpu_free_fract_shifts(const PmeGpu* pmeGpu);
 282
 283 /*! \libinternal \brief
 284  * Copies the input real-space grid from the host to the GPU.
 285  *
 286  * \param[in] pmeGpu    The PME GPU structure.
 287  * \param[in] h_grid    The host-side grid buffer.
 288  * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal
 289  *                      state or FEP state A and 1 is Coulomb in FEP state B.
 290  */
 291 void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, const float* h_grid, int gridIndex = 0);
 292
 293 /*! \libinternal \brief
 294  * Copies the output real-space grid from the GPU to the host.
 295  *
 296  * \param[in] pmeGpu    The PME GPU structure.
 297  * \param[out] h_grid   The host-side grid buffer.
 298  * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal
 299  *                      state or FEP state A and 1 is Coulomb in FEP state B.
 300  */
 301 void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid, int gridIndex = 0);
 302
 303 /*! \libinternal \brief
 304  * Copies the spread output spline data and gridline indices from the GPU to the host.
 305  *
 306  * \param[in] pmeGpu    The PME GPU structure.
 307  */
 308 void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu);
 309
 310 /*! \libinternal \brief
 311  * Copies the gather input spline data and gridline indices from the host to the GPU.
 312  *
 313  * \param[in] pmeGpu    The PME GPU structure.
 314  */
 315 void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu);
 316
 317 /*! \libinternal \brief
 318  * Waits for the grid copying to the host-side buffer after spreading to finish.
 319  *
 320  * \param[in] pmeGpu  The PME GPU structure.
 321  */
 322 void pme_gpu_sync_spread_grid(const PmeGpu* pmeGpu);
 323
 324 /*! \libinternal \brief
 325  * Initializes the CUDA FFT structures.
 326  *
 327  * \param[in] pmeGpu  The PME GPU structure.
 328  */
 329 void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu);
 330
 331 /*! \libinternal \brief
 332  * Destroys the CUDA FFT structures.
 333  *
 334  * \param[in] pmeGpu  The PME GPU structure.
 335  */
 336 void pme_gpu_destroy_3dfft(const PmeGpu* pmeGpu);
 337
 338 /* The PME stages themselves */
 339
 340 /*! \libinternal \brief
 341  * A GPU spline computation and charge spreading function.
 342  *
 343  * \param[in]  pmeGpu                    The PME GPU structure.
 344  * \param[in]  xReadyOnDevice            Event synchronizer indicating that the coordinates are
 345  *                                       ready in the device memory; can be nullptr when invoked
 346  *                                       on a separate PME rank or from PME tests.
 347  * \param[out] h_grids                   The host-side grid buffers (used only if the result
 348  *                                       of the spread is expected on the host, e.g. testing
 349  *                                       or host-side FFT)
 350  * \param[in]  computeSplines            Should the computation of spline parameters and gridline
 351  *                                       indices be performed.
 352  * \param[in]  spreadCharges             Should the charges/coefficients be spread on the grid.
 353  * \param[in]  lambda                    The lambda value of the current system state.
 354  * \param[in]  useGpuDirectComm          Whether direct GPU PME-PP communication is active
 355  * \param[in]  pmeCoordinateReceiverGpu  Coordinate receiver object, which must be valid when
 356  *                                       direct GPU PME-PP communication is active
 357  */
 358 GPU_FUNC_QUALIFIER void
 359 pme_gpu_spread(const PmeGpu*                  GPU_FUNC_ARGUMENT(pmeGpu),
 360                GpuEventSynchronizer*          GPU_FUNC_ARGUMENT(xReadyOnDevice),
 361                float**                        GPU_FUNC_ARGUMENT(h_grids),
 362                bool                           GPU_FUNC_ARGUMENT(computeSplines),
 363                bool                           GPU_FUNC_ARGUMENT(spreadCharges),
 364                real                           GPU_FUNC_ARGUMENT(lambda),
 365                bool                           GPU_FUNC_ARGUMENT(useGpuDirectComm),
 366                gmx::PmeCoordinateReceiverGpu* GPU_FUNC_ARGUMENT(pmeCoordinateReceiverGpu)) GPU_FUNC_TERM;
 367
 368 /*! \libinternal \brief
 369  * 3D FFT R2C/C2R routine.
 370  *
 371  * \param[in]  pmeGpu          The PME GPU structure.
 372  * \param[in]  direction       Transform direction (real-to-complex or complex-to-real)
 373  * \param[in]  gridIndex       The index of the grid to use. 0 is Coulomb in the normal
 374  *                             state or FEP state A and 1 is Coulomb in FEP state B.
 375  */
 376 void pme_gpu_3dfft(const PmeGpu* pmeGpu, enum gmx_fft_direction direction, int gridIndex = 0);
 377
 378 /*! \libinternal \brief
 379  * A GPU Fourier space solving function.
 380  *
 381  * \param[in]     pmeGpu                  The PME GPU structure.
 382  * \param[in]     gridIndex               The index of the grid to use. 0 is Coulomb in the normal
 383  *                                        state or FEP state A and 1 is Coulomb in FEP state B.
 384  * \param[in,out] h_grid                  The host-side input and output Fourier grid buffer (used only with testing or host-side FFT)
 385  * \param[in]     gridOrdering            Specifies the dimension ordering of the complex grid. TODO: store this information?
 386  * \param[in]     computeEnergyAndVirial  Tells if the energy and virial computation should be performed.
 387  */
 388 GPU_FUNC_QUALIFIER void pme_gpu_solve(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
 389                                       int           GPU_FUNC_ARGUMENT(gridIndex),
 390                                       t_complex*    GPU_FUNC_ARGUMENT(h_grid),
 391                                       GridOrdering  GPU_FUNC_ARGUMENT(gridOrdering),
 392                                       bool GPU_FUNC_ARGUMENT(computeEnergyAndVirial)) GPU_FUNC_TERM;
 393
 394 /*! \libinternal \brief
 395  * A GPU force gathering function.
 396  *
 397  * \param[in]     pmeGpu                   The PME GPU structure.
 398  * \param[in]     h_grids                  The host-side grid buffers (used only in testing mode).
 399  * \param[in]     lambda                   The lambda value to use.
 400  */
 401 GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
 402                                        float** GPU_FUNC_ARGUMENT(h_grids),
 403                                        float   GPU_FUNC_ARGUMENT(lambda)) GPU_FUNC_TERM;
 404
 405
 406 /*! \brief Sets the device pointer to coordinate data
 407  * \param[in] pmeGpu         The PME GPU structure.
 408  * \param[in] d_x            Pointer to coordinate data
 409  */
 410 GPU_FUNC_QUALIFIER void pme_gpu_set_kernelparam_coordinates(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
 411                                                             DeviceBuffer<gmx::RVec> GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
 412
 413 /*! \brief Return pointer to device copy of force data.
 414  * \param[in] pmeGpu         The PME GPU structure.
 415  * \returns                  Pointer to force data
 416  */
 417 GPU_FUNC_QUALIFIER DeviceBuffer<gmx::RVec> pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
 418         GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer<gmx::RVec>{});
 419
 420 /*! \brief Return pointer to the sync object triggered after the PME force calculation completion
 421  * \param[in] pmeGpu         The PME GPU structure.
 422  * \returns                  Pointer to sync object
 423  */
 424 GPU_FUNC_QUALIFIER GpuEventSynchronizer* pme_gpu_get_forces_ready_synchronizer(
 425         const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
 426
 427 /*! \libinternal \brief
 428  * Read-only accessor for the settings member of the PME GPU structure.
 429  *
 430  * \param[in] pmeGpu         The PME GPU structure; it is dereferenced without a null check.
 431  * \returns                  Const reference to the settings stored in \p pmeGpu.
 432  */
 433 inline const PmeGpuSettings& pme_gpu_settings(const PmeGpu* pmeGpu)
 434 {
 435     return (*pmeGpu).settings;
 436 }
 437
 438 /*! \libinternal \brief
 439  * Read-only accessor for the staging member of the PME GPU structure.
 440  *
 441  * \param[in] pmeGpu         The PME GPU structure; it is dereferenced without a null check.
 442  * \returns                  Const reference to the staging object stored in \p pmeGpu.
 443  */
 444 inline const PmeGpuStaging& pme_gpu_staging(const PmeGpu* pmeGpu)
 445 {
 446     return (*pmeGpu).staging;
 447 }
 448
 449 /*! \libinternal \brief
 450  * Switches the testing mode of the PME GPU module on or off; a null \p pmeGpu is ignored.
 451  *
 452  * \param[in,out] pmeGpu     The PME GPU structure whose settings are updated (may be nullptr).
 453  * \param[in]     testing    True sets copyAllOutputs and Sync transfers; false clears it and selects Async.
 454  */
 455 inline void pme_gpu_set_testing(PmeGpu* pmeGpu, bool testing)
 456 {
 457     if (pmeGpu != nullptr)
 458     {
 459         pmeGpu->settings.transferKind = testing ? GpuApiCallBehavior::Sync : GpuApiCallBehavior::Async;
 460         pmeGpu->settings.copyAllOutputs = testing;
 461     }
 462 }
 463
 464 /* A block of C++ functions that live in pme_gpu_internal.cpp */
 465
 466 /*! \libinternal \brief
 467  * Returns the energy and virial GPU outputs, useful for testing.
 468  *
 469  * It is the caller's responsibility to be aware of whether the GPU
 470  * handled the solve stage.
 471  *
 472  * \param[in] pme                The PME structure.
 473  * \param[in] lambda             The lambda value to use when calculating the results.
 474  * \param[out] output            Pointer to output where energy and virial should be stored.
 475  */
 476 GPU_FUNC_QUALIFIER void pme_gpu_getEnergyAndVirial(const gmx_pme_t& GPU_FUNC_ARGUMENT(pme),
 477                                                    float            GPU_FUNC_ARGUMENT(lambda),
 478                                                    PmeOutput* GPU_FUNC_ARGUMENT(output)) GPU_FUNC_TERM;
 479
 480 /*! \libinternal \brief
 481  * Returns the GPU outputs (forces, energy and virial)
 482  *
 483  * \param[in] pme                     The PME structure.
 484  * \param[in] computeEnergyAndVirial  Whether the energy and virial are being computed
 485  * \param[in] lambdaQ                 The Coulomb lambda to use when finalizing the output.
 486  * \returns                           The output object.
 487  */
 488 GPU_FUNC_QUALIFIER PmeOutput pme_gpu_getOutput(const gmx_pme_t& GPU_FUNC_ARGUMENT(pme),
 489                                                bool GPU_FUNC_ARGUMENT(computeEnergyAndVirial),
 490                                                real GPU_FUNC_ARGUMENT(lambdaQ))
 491         GPU_FUNC_TERM_WITH_RETURN(PmeOutput{});
 492
 493 /*! \libinternal \brief
 494  * Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_prepare_computation().
 495  *
 496  * \param[in] pmeGpu         The PME GPU structure.
 497  * \param[in] box            The unit cell box.
 498  */
 499 GPU_FUNC_QUALIFIER void pme_gpu_update_input_box(PmeGpu*      GPU_FUNC_ARGUMENT(pmeGpu),
 500                                                  const matrix GPU_FUNC_ARGUMENT(box)) GPU_FUNC_TERM;
 501
 502 /*! \libinternal \brief
 503  * Finishes the PME GPU computation, waiting for the output forces and/or energy/virial to be copied to the host.
 504  * If forces were computed, they will have arrived at the external host buffer provided to gather.
 505  * If virial/energy were computed, they will have arrived into the internal staging buffer
 506  * (even though that should have already happened before even launching the gather).
 507  * Finally, cudaEvent_t based GPU timers get updated if enabled. They also need stream synchronization for correctness.
 508  * Additionally, device-side buffers are cleared asynchronously for the next computation.
 509  *
 510  * \param[in] pmeGpu         The PME GPU structure.
 511  */
 512 void pme_gpu_finish_computation(const PmeGpu* pmeGpu);
 513
 514 /*! \libinternal \brief
 515  * Get the normal/padded grid dimensions of the real-space PME grid on GPU. Only used in tests.
 516  *
 517  * \param[in] pmeGpu             The PME GPU structure.
 518  * \param[out] gridSize          Pointer to the grid dimensions to fill in.
 519  * \param[out] paddedGridSize    Pointer to the padded grid dimensions to fill in.
 520  */
 521 GPU_FUNC_QUALIFIER void pme_gpu_get_real_grid_sizes(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
 522                                                     gmx::IVec*    GPU_FUNC_ARGUMENT(gridSize),
 523                                                     gmx::IVec* GPU_FUNC_ARGUMENT(paddedGridSize)) GPU_FUNC_TERM;
 524
 525 /*! \libinternal \brief
 526  * (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
 527  *
 528  * \param[in,out] pme            The PME structure.
 529  * \param[in]     deviceContext  The GPU context.
 530  * \param[in]     deviceStream   The GPU stream.
 531  * \param[in,out] pmeGpuProgram  The handle to the program/kernel data created outside (e.g. in unit tests/runner)
 532  *
 533  * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
 534  */
 535 GPU_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t*           GPU_FUNC_ARGUMENT(pme),
 536                                        const DeviceContext* GPU_FUNC_ARGUMENT(deviceContext),
 537                                        const DeviceStream*  GPU_FUNC_ARGUMENT(deviceStream),
 538                                        const PmeGpuProgram* GPU_FUNC_ARGUMENT(pmeGpuProgram)) GPU_FUNC_TERM;
 539
 540 /*! \libinternal \brief
 541  * Destroys the PME GPU data at the end of the run.
 542  *
 543  * \param[in] pmeGpu     The PME GPU structure.
 544  */
 545 GPU_FUNC_QUALIFIER void pme_gpu_destroy(PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM;
 546
 547 /*! \libinternal \brief
 548  * Reallocates the local atoms data (charges, coordinates, etc.). Copies the charges to the GPU.
 549  *
 550  * \param[in] pmeGpu    The PME GPU structure.
 551  * \param[in] nAtoms    The number of particles.
 552  * \param[in] chargesA  The pointer to the host-side array of particle charges in the unperturbed state or FEP state A.
 553  * \param[in] chargesB  The pointer to the host-side array of particle charges in FEP state B.
 554  *
 555  * This is a function that should only be called in the beginning of the run and on domain
 556  * decomposition. Should be called before the pme_gpu_set_io_ranges.
 557  */
 558 GPU_FUNC_QUALIFIER void pme_gpu_reinit_atoms(PmeGpu*     GPU_FUNC_ARGUMENT(pmeGpu),
 559                                              int         GPU_FUNC_ARGUMENT(nAtoms),
 560                                              const real* GPU_FUNC_ARGUMENT(chargesA),
 561                                              const real* GPU_FUNC_ARGUMENT(chargesB) = nullptr) GPU_FUNC_TERM;
 562
 563 /*! \brief \libinternal
 564  * The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
 565  *
 566  * This clears the device-side working buffers in preparation for new computation.
 567  *
 568  * \param[in] pmeGpu            The PME GPU structure.
 569  */
 570 void pme_gpu_reinit_computation(const PmeGpu* pmeGpu);
 571
 572 /*! \brief
 573  * Blocks until PME GPU tasks are completed, and gets the output forces and virial/energy
 574  * (if they were to be computed).
 575  *
 576  * \param[in]  pme                     The PME data structure.
 577  * \param[in]  computeEnergyAndVirial  Tells if the energy and virial computation should be performed.
 578  * \param[in]  lambdaQ                 The Coulomb lambda to use when calculating the results.
 579  * \param[out] wcycle                  The wallclock counter.
 580  * \returns                            The output forces, energy and virial
 581  */
 582 GPU_FUNC_QUALIFIER PmeOutput pme_gpu_wait_finish_task(gmx_pme_t* GPU_FUNC_ARGUMENT(pme),
 583                                                       bool GPU_FUNC_ARGUMENT(computeEnergyAndVirial),
 584                                                       real           GPU_FUNC_ARGUMENT(lambdaQ),
 585                                                       gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle))
 586         GPU_FUNC_TERM_WITH_RETURN(PmeOutput{});
 587
 588 #endif