src/gromacs/ewald/pme_gpu_internal.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *
   38  * \brief This file contains internal function declarations (and a few inline helpers) for performing the PME calculations on GPU.
  39  * These are not meant to be exposed outside of the PME GPU code.
  40  * As of now, their bodies are still in the common pme_gpu.cpp files.
  41  *
  42  * \author Aleksei Iupinov <a.yupinov@gmail.com>
  43  * \ingroup module_ewald
  44  */
  45
  46 #ifndef GMX_EWALD_PME_GPU_INTERNAL_H
  47 #define GMX_EWALD_PME_GPU_INTERNAL_H
  48
  49 #include "gromacs/fft/fft.h"                   // for the gmx_fft_direction enum
  50 #include "gromacs/gpu_utils/gpu_macros.h"      // for the GPU_FUNC_ macros
  51 #include "gromacs/utility/arrayref.h"
  52
  53 #include "pme_gpu_types_host.h"                     // for the inline functions accessing PmeGpu members
  54
  55 struct gmx_hw_info_t;
  56 struct gmx_gpu_opt_t;
  57 struct gmx_pme_t;                              // only used in pme_gpu_reinit
  58 struct gmx_wallclock_gpu_pme_t;
  59 class PmeAtomComm;
  60 struct t_complex;
  61
  62 namespace gmx
  63 {
  64 class MDLogger;
  65 }
  66
  67 //! Type of spline data
  68 enum class PmeSplineDataType
  69 {
  70     Values,      // theta
  71     Derivatives, // dtheta
  72 };               //TODO move this into new and shiny pme.h (pme-types.h?)
  73
  74 //! PME grid dimension ordering (from major to minor)
  75 enum class GridOrdering
  76 {
  77     YZX,
  78     XYZ
  79 };
  80
  81 /*! \libinternal \brief
  82  * Returns the number of atoms per chunk in the atom charges/coordinates data layout.
  83  * Depends on CUDA-specific block sizes, needed for the atom data padding.
  84  *
  85  * \param[in] pmeGpu            The PME GPU structure.
  86  * \returns   Number of atoms in a single GPU atom data chunk.
  87  */
  88 int pme_gpu_get_atom_data_alignment(const PmeGpu *pmeGpu);
  89
  90 /*! \libinternal \brief
  91  * Returns the number of atoms per chunk in the atom spline theta/dtheta data layout.
  92  *
  93  * \param[in] pmeGpu            The PME GPU structure.
  94  * \returns   Number of atoms in a single GPU atom spline data chunk.
  95  */
  96 int pme_gpu_get_atoms_per_warp(const PmeGpu *pmeGpu);
  97
  98 /*! \libinternal \brief
  99  * Synchronizes the current computation, waiting for the GPU kernels/transfers to finish.
 100  *
 101  * \param[in] pmeGpu            The PME GPU structure.
 102  */
 103 GPU_FUNC_QUALIFIER void pme_gpu_synchronize(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM;
 104
 105 /*! \libinternal \brief
 106  * Allocates the fixed size energy and virial buffer both on GPU and CPU.
 107  *
 108  * \param[in,out] pmeGpu            The PME GPU structure.
 109  */
 110 void pme_gpu_alloc_energy_virial(PmeGpu *pmeGpu);
 111
 112 /*! \libinternal \brief
 113  * Frees the energy and virial memory both on GPU and CPU.
 114  *
 115  * \param[in] pmeGpu            The PME GPU structure.
 116  */
 117 void pme_gpu_free_energy_virial(PmeGpu *pmeGpu);
 118
 119 /*! \libinternal \brief
 120  * Clears the energy and virial memory on GPU with 0.
 121  * Should be called at the end of PME computation which returned energy/virial.
 122  *
 123  * \param[in] pmeGpu            The PME GPU structure.
 124  */
 125 void pme_gpu_clear_energy_virial(const PmeGpu *pmeGpu);
 126
 127 /*! \libinternal \brief
 128  * Reallocates and copies the pre-computed B-spline values to the GPU.
 129  *
 130  * \param[in,out] pmeGpu             The PME GPU structure.
 131  */
 132 void pme_gpu_realloc_and_copy_bspline_values(PmeGpu *pmeGpu);
 133
 134 /*! \libinternal \brief
 135  * Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers).
 136  *
 137  * \param[in] pmeGpu             The PME GPU structure.
 138  */
 139 void pme_gpu_free_bspline_values(const PmeGpu *pmeGpu);
 140
 141 /*! \libinternal \brief
 142  * Reallocates the GPU buffer for the PME forces.
 143  *
 144  * \param[in] pmeGpu             The PME GPU structure.
 145  */
 146 void pme_gpu_realloc_forces(PmeGpu *pmeGpu);
 147
 148 /*! \libinternal \brief
 149  * Frees the GPU buffer for the PME forces.
 150  *
 151  * \param[in] pmeGpu             The PME GPU structure.
 152  */
 153 void pme_gpu_free_forces(const PmeGpu *pmeGpu);
 154
 155 /*! \libinternal \brief
 156  * Copies the forces from the CPU buffer to the GPU (to reduce them with the PME GPU gathered forces).
 157  * To be called e.g. after the bonded calculations.
 158  *
 159  * \param[in] pmeGpu             The PME GPU structure.
 160  */
 161 void pme_gpu_copy_input_forces(PmeGpu *pmeGpu);
 162
 163 /*! \libinternal \brief
 164  * Copies the forces from the GPU to the CPU buffer. To be called after the gathering stage.
 165  *
 166  * \param[in] pmeGpu             The PME GPU structure.
 167  */
 168 void pme_gpu_copy_output_forces(PmeGpu *pmeGpu);
 169
 170 /*! \libinternal \brief
 171  * Checks whether work in the PME GPU stream has completed.
 172  *
 173  * \param[in] pmeGpu            The PME GPU structure.
 174  *
 175  * \returns                     True if work in the PME stream has completed.
 176  */
 177 bool pme_gpu_stream_query(const PmeGpu *pmeGpu);
 178
 179 /*! \libinternal \brief
 180  * Reallocates the input coordinates buffer on the GPU (and clears the padded part if needed).
 181  *
 182  * \param[in] pmeGpu            The PME GPU structure.
 183  *
 184  * Needs to be called on every DD step/in the beginning.
 185  */
 186 void pme_gpu_realloc_coordinates(const PmeGpu *pmeGpu);
 187
 188 /*! \libinternal \brief
 189  * Frees the coordinates on the GPU.
 190  *
 191  * \param[in] pmeGpu            The PME GPU structure.
 192  */
 193 void pme_gpu_free_coordinates(const PmeGpu *pmeGpu);
 194
 195 /*! \libinternal \brief
 196  * Reallocates the buffer on the GPU and copies the charges/coefficients from the CPU buffer.
 197  * Clears the padded part if needed.
 198  *
 199  * \param[in] pmeGpu            The PME GPU structure.
 200  * \param[in] h_coefficients    The input atom charges/coefficients.
 201  *
 202  * Does not need to be done for every PME computation, only whenever the local charges change.
 203  * (So, in the beginning of the run, or on DD step).
 204  */
 205 void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu    *pmeGpu,
 206                                                  const float     *h_coefficients);
 207
 208 /*! \libinternal \brief
 209  * Frees the charges/coefficients on the GPU.
 210  *
 211  * \param[in] pmeGpu             The PME GPU structure.
 212  */
 213 void pme_gpu_free_coefficients(const PmeGpu *pmeGpu);
 214
 215 /*! \libinternal \brief
 216  * Reallocates the buffers on the GPU and the host for the atoms spline data.
 217  *
 218  * \param[in,out] pmeGpu            The PME GPU structure.
 219  */
 220 void pme_gpu_realloc_spline_data(PmeGpu *pmeGpu);
 221
 222 /*! \libinternal \brief
 223  * Frees the buffers on the GPU for the atoms spline data.
 224  *
 225  * \param[in] pmeGpu            The PME GPU structure.
 226  */
 227 void pme_gpu_free_spline_data(const PmeGpu *pmeGpu);
 228
 229 /*! \libinternal \brief
 230  * Reallocates the buffers on the GPU and the host for the particle gridline indices.
 231  *
 232  * \param[in,out] pmeGpu            The PME GPU structure.
 233  */
 234 void pme_gpu_realloc_grid_indices(PmeGpu *pmeGpu);
 235
 236 /*! \libinternal \brief
 237  * Frees the buffer on the GPU for the particle gridline indices.
 238  *
 239  * \param[in] pmeGpu            The PME GPU structure.
 240  */
 241 void pme_gpu_free_grid_indices(const PmeGpu *pmeGpu);
 242
 243 /*! \libinternal \brief
 244  * Reallocates the real space grid and the complex reciprocal grid (if needed) on the GPU.
 245  *
 246  * \param[in] pmeGpu            The PME GPU structure.
 247  */
 248 void pme_gpu_realloc_grids(PmeGpu *pmeGpu);
 249
 250 /*! \libinternal \brief
 251  * Frees the real space grid and the complex reciprocal grid (if needed) on the GPU.
 252  *
 253  * \param[in] pmeGpu            The PME GPU structure.
 254  */
 255 void pme_gpu_free_grids(const PmeGpu *pmeGpu);
 256
 257 /*! \libinternal \brief
 258  * Clears the real space grid on the GPU.
 259  * Should be called at the end of each computation.
 260  *
 261  * \param[in] pmeGpu            The PME GPU structure.
 262  */
 263 void pme_gpu_clear_grids(const PmeGpu *pmeGpu);
 264
 265 /*! \libinternal \brief
 266  * Reallocates and copies the pre-computed fractional coordinates' shifts to the GPU.
 267  *
 268  * \param[in] pmeGpu            The PME GPU structure.
 269  */
 270 void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu *pmeGpu);
 271
 272 /*! \libinternal \brief
 273  * Frees the pre-computed fractional coordinates' shifts on the GPU.
 274  *
 275  * \param[in] pmeGpu            The PME GPU structure.
 276  */
 277 void pme_gpu_free_fract_shifts(const PmeGpu *pmeGpu);
 278
 279 /*! \libinternal \brief
 280  * Copies the input real-space grid from the host to the GPU.
 281  *
 282  * \param[in] pmeGpu   The PME GPU structure.
 283  * \param[in] h_grid   The host-side grid buffer.
 284  */
 285 void pme_gpu_copy_input_gather_grid(const PmeGpu *pmeGpu,
 286                                     float        *h_grid);
 287
 288 /*! \libinternal \brief
 289  * Copies the output real-space grid from the GPU to the host.
 290  *
 291  * \param[in] pmeGpu   The PME GPU structure.
 292  * \param[out] h_grid  The host-side grid buffer.
 293  */
 294 void pme_gpu_copy_output_spread_grid(const PmeGpu *pmeGpu,
 295                                      float        *h_grid);
 296
 297 /*! \libinternal \brief
 298  * Copies the spread output spline data and gridline indices from the GPU to the host.
 299  *
 300  * \param[in] pmeGpu   The PME GPU structure.
 301  */
 302 void pme_gpu_copy_output_spread_atom_data(const PmeGpu *pmeGpu);
 303
 304 /*! \libinternal \brief
 305  * Copies the gather input spline data and gridline indices from the host to the GPU.
 306  *
 307  * \param[in] pmeGpu   The PME GPU structure.
 308  */
 309 void pme_gpu_copy_input_gather_atom_data(const PmeGpu *pmeGpu);
 310
 311 /*! \libinternal \brief
 312  * Waits for the grid copying to the host-side buffer after spreading to finish.
 313  *
 314  * \param[in] pmeGpu  The PME GPU structure.
 315  */
 316 void pme_gpu_sync_spread_grid(const PmeGpu *pmeGpu);
 317
 318 /*! \libinternal \brief
 319  * Does the one-time GPU-framework specific PME initialization.
 320  * For CUDA, the PME stream is created with the highest priority.
 321  *
 322  * \param[in] pmeGpu  The PME GPU structure.
 323  */
 324 void pme_gpu_init_internal(PmeGpu *pmeGpu);
 325
 326 /*! \libinternal \brief
 327  * Destroys the PME GPU-framework specific data.
 328  * Should be called last in the PME GPU destructor.
 329  *
 330  * \param[in] pmeGpu  The PME GPU structure.
 331  */
 332 void pme_gpu_destroy_specific(const PmeGpu *pmeGpu);
 333
 334 /*! \libinternal \brief
 335  * Initializes the CUDA FFT structures.
 336  *
 337  * \param[in] pmeGpu  The PME GPU structure.
 338  */
 339 void pme_gpu_reinit_3dfft(const PmeGpu *pmeGpu);
 340
 341 /*! \libinternal \brief
 342  * Destroys the CUDA FFT structures.
 343  *
 344  * \param[in] pmeGpu  The PME GPU structure.
 345  */
 346 void pme_gpu_destroy_3dfft(const PmeGpu *pmeGpu);
 347
 348 /* Several GPU event-based timing functions that live in pme_gpu_timings.cpp */
 349
 350 /*! \libinternal \brief
 351  * Finalizes all the active PME GPU stage timings for the current computation. Should be called at the end of every computation.
 352  *
 353  * \param[in] pmeGpu         The PME GPU structure.
 354  */
 355 void pme_gpu_update_timings(const PmeGpu *pmeGpu);
 356
 357 /*! \libinternal \brief
 358  * Updates the internal list of active PME GPU stages (if timings are enabled).
 359  *
 360  * \param[in] pmeGpu         The PME GPU data structure.
 361  */
 362 void pme_gpu_reinit_timings(const PmeGpu *pmeGpu);
 363
 364 /*! \brief
 365  * Resets the PME GPU timings. To be called at the reset MD step.
 366  *
 367  * \param[in] pmeGpu         The PME GPU structure.
 368  */
 369 void pme_gpu_reset_timings(const PmeGpu *pmeGpu);
 370
 371 /*! \libinternal \brief
 372  * Copies the PME GPU timings to the gmx_wallclock_gpu_t structure (for log output). To be called at the run end.
 373  *
 374  * \param[in] pmeGpu         The PME GPU structure.
 375  * \param[in] timings        The gmx_wallclock_gpu_pme_t structure.
 376  */
 377 void pme_gpu_get_timings(const PmeGpu            *pmeGpu,
 378                          gmx_wallclock_gpu_pme_t *timings);
 379
 380 /* The PME stages themselves */
 381
 382 /*! \libinternal \brief
 383  * A GPU spline computation and charge spreading function.
 384  *
 385  * \param[in]  pmeGpu          The PME GPU structure.
 386  * \param[in]  xReadyOnDevice  Event synchronizer indicating that the coordinates are ready in the device memory;
 387  *                             can be nullptr when invoked on a separate PME rank or from PME tests.
 388  * \param[in]  gridIndex       Index of the PME grid - unused, assumed to be 0.
 389  * \param[out] h_grid          The host-side grid buffer (used only if the result of the spread is expected on the host,
 390  *                             e.g. testing or host-side FFT)
 391  * \param[in]  computeSplines  Should the computation of spline parameters and gridline indices be performed.
 392  * \param[in]  spreadCharges   Should the charges/coefficients be spread on the grid.
 393  */
 394 GPU_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu         *GPU_FUNC_ARGUMENT(pmeGpu),
 395                                        GpuEventSynchronizer *GPU_FUNC_ARGUMENT(xReadyOnDevice),
 396                                        int                   GPU_FUNC_ARGUMENT(gridIndex),
 397                                        real                 *GPU_FUNC_ARGUMENT(h_grid),
 398                                        bool                  GPU_FUNC_ARGUMENT(computeSplines),
 399                                        bool                  GPU_FUNC_ARGUMENT(spreadCharges)) GPU_FUNC_TERM;
 400
 401 /*! \libinternal \brief
 402  * 3D FFT R2C/C2R routine.
 403  *
 404  * \param[in]  pmeGpu          The PME GPU structure.
 405  * \param[in]  direction       Transform direction (real-to-complex or complex-to-real)
 406  * \param[in]  gridIndex       Index of the PME grid - unused, assumed to be 0.
 407  */
 408 void pme_gpu_3dfft(const PmeGpu          *pmeGpu,
 409                    enum gmx_fft_direction direction,
 410                    int                    gridIndex);
 411
 412 /*! \libinternal \brief
 413  * A GPU Fourier space solving function.
 414  *
 415  * \param[in]     pmeGpu                  The PME GPU structure.
 416  * \param[in,out] h_grid                  The host-side input and output Fourier grid buffer (used only with testing or host-side FFT)
  417  * \param[in]     gridOrdering            Specifies the dimension ordering of the complex grid. TODO: store this information?
 418  * \param[in]     computeEnergyAndVirial  Tells if the energy and virial computation should also be performed.
 419  */
 420 GPU_FUNC_QUALIFIER void pme_gpu_solve(const PmeGpu    *GPU_FUNC_ARGUMENT(pmeGpu),
 421                                       t_complex       *GPU_FUNC_ARGUMENT(h_grid),
 422                                       GridOrdering     GPU_FUNC_ARGUMENT(gridOrdering),
 423                                       bool             GPU_FUNC_ARGUMENT(computeEnergyAndVirial)) GPU_FUNC_TERM;
 424
 425 /*! \libinternal \brief
 426  * A GPU force gathering function.
 427  *
 428  * \param[in]     pmeGpu           The PME GPU structure.
 429  * \param[in]     forceTreatment   Tells how data in h_forces should be treated.
 430  *                                 TODO: determine efficiency/balance of host/device-side reductions.
 431  * \param[in]     h_grid           The host-side grid buffer (used only in testing mode)
 432  */
 433 GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu                *GPU_FUNC_ARGUMENT(pmeGpu),
 434                                        PmeForceOutputHandling GPU_FUNC_ARGUMENT(forceTreatment),
 435                                        const float           *GPU_FUNC_ARGUMENT(h_grid)) GPU_FUNC_TERM;
 436
 437 /*! \brief Return pointer to device copy of coordinate data.
 438  * \param[in] pmeGpu         The PME GPU structure.
 439  * \returns                  Pointer to coordinate data
 440  */
 441 GPU_FUNC_QUALIFIER DeviceBuffer<float> pme_gpu_get_kernelparam_coordinates(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer<float> {});
 442
 443 /*! \brief Sets the device pointer to coordinate data
 444  * \param[in] pmeGpu         The PME GPU structure.
 445  * \param[in] d_x            Pointer to coordinate data
 446  */
 447 GPU_FUNC_QUALIFIER void pme_gpu_set_kernelparam_coordinates(const PmeGpu        *GPU_FUNC_ARGUMENT(pmeGpu),
 448                                                             DeviceBuffer<float>  GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
 449
 450 /*! \brief Return pointer to device copy of force data.
 451  * \param[in] pmeGpu         The PME GPU structure.
 452  * \returns                  Pointer to force data
 453  */
 454 GPU_FUNC_QUALIFIER void * pme_gpu_get_kernelparam_forces(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
 455
 456 /*! \brief Return pointer to GPU stream.
 457  * \param[in] pmeGpu         The PME GPU structure.
 458  * \returns                  Pointer to stream object.
 459  */
 460 GPU_FUNC_QUALIFIER void * pme_gpu_get_stream(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
 461
 462 /*! \brief Return pointer to GPU context (for OpenCL builds).
 463  * \param[in] pmeGpu         The PME GPU structure.
 464  * \returns                  Pointer to context object.
 465  */
 466 GPU_FUNC_QUALIFIER void * pme_gpu_get_context(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
 467
 468 /*! \brief Return pointer to the sync object triggered after the PME force calculation completion
 469  * \param[in] pmeGpu         The PME GPU structure.
 470  * \returns                  Pointer to sync object
 471  */
 472 GPU_FUNC_QUALIFIER GpuEventSynchronizer *pme_gpu_get_forces_ready_synchronizer(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
 473
 474 /* The inlined convenience PME GPU status getters */
 475
 476 /*! \libinternal \brief
 477  * Tells if PME runs on multiple GPUs with the decomposition.
 478  *
 479  * \param[in] pmeGpu         The PME GPU structure.
 480  * \returns                  True if PME runs on multiple GPUs, false otherwise.
 481  */
 482 inline bool pme_gpu_uses_dd(const PmeGpu *pmeGpu)
 483 {
 484     return !pmeGpu->settings.useDecomposition;
 485 }
 486
 487 /*! \libinternal \brief
 488  * Tells if PME performs the gathering stage on GPU.
 489  *
 490  * \param[in] pmeGpu         The PME GPU structure.
 491  * \returns                  True if the gathering is performed on GPU, false otherwise.
 492  */
 493 inline bool pme_gpu_performs_gather(const PmeGpu *pmeGpu)
 494 {
 495     return pmeGpu->settings.performGPUGather;
 496 }
 497
 498 /*! \libinternal \brief
 499  * Tells if PME performs the FFT stages on GPU.
 500  *
 501  * \param[in] pmeGpu         The PME GPU structure.
 502  * \returns                  True if FFT is performed on GPU, false otherwise.
 503  */
 504 inline bool pme_gpu_performs_FFT(const PmeGpu *pmeGpu)
 505 {
 506     return pmeGpu->settings.performGPUFFT;
 507 }
 508
 509 /*! \libinternal \brief
 510  * Tells if PME performs the grid (un-)wrapping on GPU.
 511  *
 512  * \param[in] pmeGpu         The PME GPU structure.
 513  * \returns                  True if (un-)wrapping is performed on GPU, false otherwise.
 514  */
 515 inline bool pme_gpu_performs_wrapping(const PmeGpu *pmeGpu)
 516 {
 517     return pmeGpu->settings.useDecomposition;
 518 }
 519
 520 /*! \libinternal \brief
 521  * Tells if PME performs the grid solving on GPU.
 522  *
 523  * \param[in] pmeGpu         The PME GPU structure.
 524  * \returns                  True if solving is performed on GPU, false otherwise.
 525  */
 526 inline bool pme_gpu_performs_solve(const PmeGpu *pmeGpu)
 527 {
 528     return pmeGpu->settings.performGPUSolve;
 529 }
 530
 531 /*! \libinternal \brief
 532  * Enables or disables the testing mode.
 533  * Testing mode only implies copying all the outputs, even the intermediate ones, to the host,
 534  * and also makes the copies synchronous.
 535  *
 536  * \param[in] pmeGpu             The PME GPU structure.
 537  * \param[in] testing            Should the testing mode be enabled, or disabled.
 538  */
 539 inline void pme_gpu_set_testing(PmeGpu *pmeGpu, bool testing)
 540 {
 541     if (pmeGpu)
 542     {
 543         pmeGpu->settings.copyAllOutputs = testing;
 544         pmeGpu->settings.transferKind   = testing ? GpuApiCallBehavior::Sync : GpuApiCallBehavior::Async;
 545     }
 546 }
 547
 548 /*! \libinternal \brief
 549  * Tells if PME is in the testing mode.
 550  *
 551  * \param[in] pmeGpu             The PME GPU structure.
 552  * \returns                      true if testing mode is enabled, false otherwise.
 553  */
 554 inline bool pme_gpu_is_testing(const PmeGpu *pmeGpu)
 555 {
 556     return pmeGpu->settings.copyAllOutputs;
 557 }
 558
 559 /* A block of C++ functions that live in pme_gpu_internal.cpp */
 560
 561 /*! \libinternal \brief
 562  * Returns the energy and virial GPU outputs, useful for testing.
 563  *
 564  * It is the caller's responsibility to be aware of whether the GPU
 565  * handled the solve stage.
 566  *
 567  * \param[in] pme                The PME structure.
 568  * \param[out] output            Pointer to output where energy and virial should be stored.
 569  */
 570 GPU_FUNC_QUALIFIER void
 571     pme_gpu_getEnergyAndVirial(const gmx_pme_t &GPU_FUNC_ARGUMENT(pme),
 572                                PmeOutput       *GPU_FUNC_ARGUMENT(output)) GPU_FUNC_TERM;
 573
 574 /*! \libinternal \brief
 575  * Returns the GPU outputs (forces, energy and virial)
 576  *
 577  * \param[in] pme                The PME structure.
 578  * \param[in] flags              The combination of flags that affected this PME computation.
 579  *                               The flags are the GMX_PME_ flags from pme.h.
 580  * \returns                      The output object.
 581  */
 582 GPU_FUNC_QUALIFIER PmeOutput
 583     pme_gpu_getOutput(const gmx_pme_t &GPU_FUNC_ARGUMENT(pme),
 584                       int              GPU_FUNC_ARGUMENT(flags)) GPU_FUNC_TERM_WITH_RETURN(PmeOutput {});
 585
 586 /*! \libinternal \brief
 587  * Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_prepare_computation().
 588  *
 589  * \param[in] pmeGpu         The PME GPU structure.
 590  * \param[in] box            The unit cell box.
 591  */
 592 GPU_FUNC_QUALIFIER void pme_gpu_update_input_box(PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
 593                                                  const matrix GPU_FUNC_ARGUMENT(box)) GPU_FUNC_TERM;
 594
 595 /*! \libinternal \brief
 596  * Finishes the PME GPU computation, waiting for the output forces and/or energy/virial to be copied to the host.
 597  * If forces were computed, they will have arrived at the external host buffer provided to gather.
 598  * If virial/energy were computed, they will have arrived into the internal staging buffer
 599  * (even though that should have already happened before even launching the gather).
 600  * Finally, cudaEvent_t based GPU timers get updated if enabled. They also need stream synchronization for correctness.
 601  * Additionally, device-side buffers are cleared asynchronously for the next computation.
 602  *
 603  * \param[in] pmeGpu         The PME GPU structure.
 604  */
 605 void pme_gpu_finish_computation(const PmeGpu *pmeGpu);
 606
 607 //! A binary enum for spline data layout transformation
 608 enum class PmeLayoutTransform
 609 {
 610     GpuToHost,
 611     HostToGpu
 612 };
 613
 614 /*! \libinternal \brief
 615  * Rearranges the atom spline data between the GPU and host layouts.
 616  * Only used for test purposes so far, likely to be horribly slow.
 617  *
 618  * \param[in]  pmeGpu     The PME GPU structure.
 619  * \param[out] atc        The PME CPU atom data structure (with a single-threaded layout).
 620  * \param[in]  type       The spline data type (values or derivatives).
 621  * \param[in]  dimIndex   Dimension index.
 622  * \param[in]  transform  Layout transform type
 623  */
 624 GPU_FUNC_QUALIFIER void pme_gpu_transform_spline_atom_data(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
 625                                                            const PmeAtomComm *GPU_FUNC_ARGUMENT(atc),
 626                                                            PmeSplineDataType GPU_FUNC_ARGUMENT(type),
 627                                                            int GPU_FUNC_ARGUMENT(dimIndex),
 628                                                            PmeLayoutTransform GPU_FUNC_ARGUMENT(transform)) GPU_FUNC_TERM;
 629
 630 /*! \libinternal \brief
 631  * Gets a unique index to an element in a spline parameter buffer (theta/dtheta),
 632  * which is laid out for GPU spread/gather kernels. The index is wrt the execution block,
 633  * in range(0, atomsPerBlock * order * DIM).
 634  * This is a wrapper, only used in unit tests.
 635  * \param[in] order            PME order
 636  * \param[in] splineIndex      Spline contribution index (from 0 to \p order - 1)
 637  * \param[in] dimIndex         Dimension index (from 0 to 2)
 638  * \param[in] atomIndex        Atom index wrt the block.
 639  * \param[in] atomsPerWarp     Number of atoms processed by a warp.
 640  *
 641  * \returns Index into theta or dtheta array using GPU layout.
 642  */
 643 int getSplineParamFullIndex(int order,
 644                             int splineIndex,
 645                             int dimIndex,
 646                             int atomIndex,
 647                             int atomsPerWarp);
 648
 649 /*! \libinternal \brief
 650  * Get the normal/padded grid dimensions of the real-space PME grid on GPU. Only used in tests.
 651  *
 652  * \param[in] pmeGpu             The PME GPU structure.
 653  * \param[out] gridSize          Pointer to the grid dimensions to fill in.
 654  * \param[out] paddedGridSize    Pointer to the padded grid dimensions to fill in.
 655  */
 656 GPU_FUNC_QUALIFIER void pme_gpu_get_real_grid_sizes(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
 657                                                     gmx::IVec *GPU_FUNC_ARGUMENT(gridSize),
 658                                                     gmx::IVec *GPU_FUNC_ARGUMENT(paddedGridSize)) GPU_FUNC_TERM;
 659
 660 /*! \libinternal \brief
 661  * (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
 662  *
 663  * \param[in,out] pme             The PME structure.
 664  * \param[in]     gpuInfo         The GPU information structure.
 665  * \param[in]     pmeGpuProgram   The PME GPU program data
 666  * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
 667  */
 668 GPU_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
 669                                        const gmx_device_info_t *GPU_FUNC_ARGUMENT(gpuInfo),
 670                                        PmeGpuProgramHandle GPU_FUNC_ARGUMENT(pmeGpuProgram)) GPU_FUNC_TERM;
 671
 672 /*! \libinternal \brief
 673  * Destroys the PME GPU data at the end of the run.
 674  *
 675  * \param[in] pmeGpu     The PME GPU structure.
 676  */
 677 GPU_FUNC_QUALIFIER void pme_gpu_destroy(PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM;
 678
 679 /*! \libinternal \brief
 680  * Reallocates the local atoms data (charges, coordinates, etc.). Copies the charges to the GPU.
 681  *
 682  * \param[in] pmeGpu    The PME GPU structure.
 683  * \param[in] nAtoms    The number of particles.
 684  * \param[in] charges   The pointer to the host-side array of particle charges.
 685  *
 686  * This is a function that should only be called in the beginning of the run and on domain decomposition.
 687  * Should be called before the pme_gpu_set_io_ranges.
 688  */
 689 GPU_FUNC_QUALIFIER void pme_gpu_reinit_atoms(PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
 690                                              int         GPU_FUNC_ARGUMENT(nAtoms),
 691                                              const real       *GPU_FUNC_ARGUMENT(charges)) GPU_FUNC_TERM;
 692
 693 /*! \brief \libinternal
 694  * The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
 695  *
 696  * This clears the device-side working buffers in preparation for new computation.
 697  *
 698  * \param[in] pmeGpu            The PME GPU structure.
 699  */
 700 void pme_gpu_reinit_computation(const PmeGpu *pmeGpu);
 701
 702 /*! \brief
 703  * Blocks until PME GPU tasks are completed, and gets the output forces and virial/energy
 704  * (if they were to be computed).
 705  *
 706  * \param[in]  pme            The PME data structure.
 707  * \param[in]  flags          The combination of flags to affect this PME computation.
 708  *                            The flags are the GMX_PME_ flags from pme.h.
 709  * \param[out] wcycle         The wallclock counter.
 710  * \return     The output forces, energy and virial
 711  */
 712 GPU_FUNC_QUALIFIER PmeOutput
 713     pme_gpu_wait_finish_task(gmx_pme_t            *GPU_FUNC_ARGUMENT(pme),
 714                              int                   GPU_FUNC_ARGUMENT(flags),
 715                              gmx_wallcycle        *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM_WITH_RETURN(PmeOutput {}
 716                                                                                                         );
 717
 718 #endif