src/gromacs/ewald/pme-gpu-types.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2016,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \libinternal \file
  37  * \brief Defines the GPU-agnostic PME GPU data structures
  38  * (the host-side PME GPU data, and the GPU function parameters).
  39  * \todo Due to Gerrit workflow and time constraints, some renaming/refactoring
  40  * which does not impair the performance will be performed once
  41  * most of the initial PME CUDA implementation is merged
  42  * into the master branch (likely, after release 2017).
  43  * This should include:
  44  * -- bringing the structure/function names up to guidelines
  45  * ---- pme_gpu_settings_t -> PmeGpuTasks
  46  * -- refining GPU notation application (#2053)
  47  * -- renaming coefficients to charges (?)
  48  *
  49  * \author Aleksei Iupinov <a.yupinov@gmail.com>
  50  * \ingroup module_ewald
  51  */
  52
  53 #ifndef GMX_EWALD_PME_GPU_TYPES_H
  54 #define GMX_EWALD_PME_GPU_TYPES_H
  55
  56 #include "config.h"
  57
  58 #include <memory>
  59 #include <vector>
  60
  61 #include "gromacs/math/vectypes.h"
  62 #include "gromacs/utility/basedefinitions.h"
  63
  64 struct gmx_hw_info;
  65 struct gmx_device_info_t;
  66
  67 /*! \brief Possible PME codepaths, i.e. which parts of the PME step run on the CPU and which on the GPU.
  68  * \todo Make this an enum class with the gmx_pme_t C++ refactoring; until then the enumerators are unscoped names.
  69  */
  70 enum PmeRunMode
  71 {
  72     CPU,     //!< Whole PME step is done on CPU.
  73     GPU,     //!< Whole PME step is done on GPU.
  74     Hybrid,  //!< Mixed mode: only spread and gather run on GPU; FFT and solving are done on CPU.
  75 };
  76
  77 #if GMX_GPU == GMX_GPU_CUDA
  78
  79 struct pme_gpu_cuda_t;
  80 /*! \brief A typedef for including the GPU host data by pointer (CUDA builds: the forward-declared pme_gpu_cuda_t). */
  81 typedef pme_gpu_cuda_t pme_gpu_specific_t;
  82
  83 struct pme_gpu_cuda_kernel_params_t;
  84 /*! \brief A typedef for including the GPU kernel arguments data by pointer (CUDA builds: the forward-declared pme_gpu_cuda_kernel_params_t). */
  85 typedef pme_gpu_cuda_kernel_params_t pme_gpu_kernel_params_t;
  86
  87 #else
  88
  89 /*! \brief A dummy typedef for the GPU host data placeholder, used on all non-CUDA builds (GMX_GPU != GMX_GPU_CUDA). */
  90 typedef int pme_gpu_specific_t;
  91 /*! \brief A dummy typedef for the GPU kernel arguments data placeholder, used on all non-CUDA builds (GMX_GPU != GMX_GPU_CUDA). */
  92 typedef int pme_gpu_kernel_params_t;
  93
  94 #endif
  95
  96 /* What follows are all the PME GPU function arguments,
  97  * sorted into several device-side structures depending on the update rate.
  98  * This is GPU agnostic (float3 replaced by float[3], etc.).
  99  * The GPU-framework specifics (e.g. cudaTextureObject_t handles) are described
 100  * in the larger structure pme_gpu_cuda_kernel_params_t in pme.cuh.
 101  */
 102
 103 /*! \internal \brief
 104  * A GPU data structure for storing the constant PME data.
 105  * This only has to be initialized once.
 106  */
 107 struct pme_gpu_const_params_t
 108 {
 109     /*! \brief Electrostatics coefficient = ONE_4PI_EPS0 / pme->epsilon_r */
 110     float elFactor;
 111     /*! \brief Virial and energy GPU array. Size is PME_GPU_ENERGY_AND_VIRIAL_COUNT (7) floats (NOTE(review): pme_gpu_staging_t spells this constant PME_GPU_VIRIAL_AND_ENERGY_COUNT - confirm the defined name).
 112      * The element order is virxx, viryy, virzz, virxy, virxz, viryz, energy. */
 113     float *d_virialAndEnergy;
 114 };
 115
 116 /*! \internal \brief
 117  * A GPU data structure for storing the PME data related to the grid sizes and cut-off.
 118  * This only has to be updated at every DD step.
 119  */
 120 struct pme_gpu_grid_params_t
 121 {
 122     /* Grid sizes */
 123     /*! \brief Real-space grid data dimensions. */
 124     int   realGridSize[DIM];
 125     /*! \brief Real-space grid dimensions; the same values as realGridSize, only converted to floating point. */
 126     float realGridSizeFP[DIM];
 127     /*! \brief Real-space grid dimensions (padded). The padding as compared to realGridSize includes the (order - 1) overlap. */
 128     int   realGridSizePadded[DIM]; /* Open question: is the major dimension of this ever used in kernels? */
 129     /*! \brief Fourier grid dimensions. These are counted in complex numbers! */
 130     int   complexGridSize[DIM];
 131     /*! \brief Fourier grid dimensions (padded). These are counted in complex numbers! */
 132     int   complexGridSizePadded[DIM];
 133
 134     /* Grid pointers */
 135     /*! \brief Real-space grid. */
 136     float *d_realGrid;
 137     /*! \brief Complex grid - used in FFT/solve. If in-place cuFFT is used, then it is the same pointer as d_realGrid. */
 138     float *d_fourierGrid;
 139
 140     /*! \brief Ewald solving factor = (M_PI / pme->ewaldcoeff_q)^2 */
 141     float ewaldFactor;
 142
 143     /*! \brief Grid spline values as in pme->bsp_mod
 144      * (laid out sequentially: all X values, then all Y values, then all Z values (XXX....XYYY......YZZZ.....Z))
 145      */
 146     float              *d_splineModuli;
 147     /*! \brief Offsets for X/Y/Z components of d_splineModuli */
 148     int                 splineValuesOffset[DIM];
 149
 150     /*! \brief Fractional shifts lookup table as in pme->fshx/fshy/fshz, laid out sequentially (XXX....XYYY......YZZZ.....Z) */
 151     float               *d_fractShiftsTable;
 152     /*! \brief Gridline indices lookup table
 153      * (modulo lookup table as in pme->nnx/nny/nnz, laid out sequentially (XXX....XYYY......YZZZ.....Z)) */
 154     int                *d_gridlineIndicesTable;
 155     /*! \brief Offsets for X/Y/Z components of d_fractShiftsTable and d_gridlineIndicesTable (one offset array shared by both tables) */
 156     int                 tablesOffsets[DIM];
 157 };
 158
 159 /*! \internal \brief
 160  * A GPU data structure for storing the PME data of the atoms, local to this process' domain partition.
 161  * This only has to be updated every DD step.
 162  */
 163 struct pme_gpu_atom_params_t
 164 {
 165     /*! \brief Number of local atoms */
 166     int    nAtoms;
 167     /*! \brief Pointer to the global GPU memory with input rvec atom coordinates.
 168      * The coordinates themselves change and need to be copied to the GPU every MD step,
 169      * but reallocation happens only at DD.
 170      */
 171     float *d_coordinates;
 172     /*! \brief Pointer to the global GPU memory with input atom charges (named "coefficients" in this code).
 173      * The charges only need to be reallocated and copied to the GPU at DD step.
 174      */
 175     float  *d_coefficients;
 176     /*! \brief Pointer to the global GPU memory with input/output rvec atom forces.
 177      * The forces change and need to be copied from (and possibly to) the GPU every MD step,
 178      * but reallocation happens only at DD.
 179      */
 180     float  *d_forces;
 181     /*! \brief Pointer to the global GPU memory with ivec atom gridline indices.
 182      * Computed on GPU in the spline calculation part.
 183      */
 184     int *d_gridlineIndices;
 185
 186     /* B-spline parameters are computed entirely on GPU every MD step, not copied.
 187      * Open question: unless we want to try something like GPU spread + CPU gather?
 188      */
 189     /*! \brief Pointer to the global GPU memory with B-spline values */
 190     float  *d_theta;
 191     /*! \brief Pointer to the global GPU memory with B-spline derivative values */
 192     float  *d_dtheta;
 193 };
 194
 195 /*! \internal \brief
 196  * A GPU data structure for storing the PME data which might change every MD step.
 197  */
 198 struct pme_gpu_step_params_t
 199 {
 200     /* The box parameters. The box only changes size each step with pressure coupling enabled. */
 201     /*! \brief
 202      * Reciprocal (inverted unit cell) box.
 203      *
 204      * The box is transposed as compared to the CPU pme->recipbox.
 205      * Basically, spread uses matrix columns (while solve and gather use rows).
 206      * This storage format might not be optimal: the box is always triangular, so some of the stored elements are zeroes.
 207      */
 208     float  recipBox[DIM][DIM];
 209     /*! \brief The unit cell volume for solving. */
 210     float  boxVolume;
 211 };
 212
 213 /*! \internal \brief
 214  * A single structure encompassing almost all the PME data used in GPU kernels on device.
 215  * This is inherited by the GPU framework-specific structure
 216  * (pme_gpu_cuda_kernel_params_t in pme.cuh).
 217  * This way, most code preparing the kernel parameters can be GPU-agnostic by casting
 218  * the kernel parameter data pointer to pme_gpu_kernel_params_base_t.
 219  */
 220 struct pme_gpu_kernel_params_base_t
 221 {
 222     /*! \brief Constant data that is set once. */
 223     pme_gpu_const_params_t constants;
 224     /*! \brief Data dependent on the grid size/cutoff. */
 225     pme_gpu_grid_params_t  grid;
 226     /*! \brief Data dependent on the DD and local atoms. */
 227     pme_gpu_atom_params_t  atoms;
 228     /*! \brief Data that possibly changes on every MD step (the reciprocal box and the box volume). */
 229     pme_gpu_step_params_t  step;
 230 };
 231
 232 /* Here are the host-side structures */
 233
 234 /*! \internal \brief
 235  * The PME GPU settings structure, included in the main PME GPU structure by value.
 236  */
 237 struct pme_gpu_settings_t
 238 {
 239     /* Permanent settings set on initialization */
 240     /*! \brief A boolean which tells if the solving is performed on GPU. Currently always true */
 241     bool performGPUSolve;
 242     /*! \brief A boolean which tells if the gathering is performed on GPU. Currently always true */
 243     bool performGPUGather;
 244     /*! \brief A boolean which tells if the FFT is performed on GPU. Currently true for a single MPI rank. */
 245     bool performGPUFFT;
 246     /*! \brief A convenience boolean which tells if PME decomposition is used. */
 247     bool useDecomposition;
 248     /*! \brief A boolean which tells if any PME GPU stage should copy all of its outputs to the host.
 249      * Only intended to be used by the test framework.
 250      */
 251     bool copyAllOutputs;
 252     /*! \brief Various computation flags for the current step, corresponding to the GMX_PME_ flags in pme.h. */
 253     int  stepFlags;
 254 };
 255
 256 /*! \internal \brief
 257  * The PME GPU intermediate buffers structure, included in the main PME GPU structure by value.
 258  * Buffers are managed by the PME GPU module.
 259  */
 260 struct pme_gpu_staging_t
 261 {
 262     /*! \brief Virial and energy intermediate host-side buffer. Size is PME_GPU_VIRIAL_AND_ENERGY_COUNT (NOTE(review): pme_gpu_const_params_t spells this constant PME_GPU_ENERGY_AND_VIRIAL_COUNT - confirm the defined name). */
 263     float  *h_virialAndEnergy;
 264     /*! \brief B-spline values intermediate host-side buffer (NOTE(review): presumably the host-side staging copy of the grid spline moduli, d_splineModuli - confirm). */
 265     float  *h_splineModuli;
 266
 267     /*! \brief Pointer to the host memory with B-spline values. Only used for host-side gather, or unit tests */
 268     float  *h_theta;
 269     /*! \brief Pointer to the host memory with B-spline derivative values. Only used for host-side gather, or unit tests */
 270     float  *h_dtheta;
 271     /*! \brief Pointer to the host memory with ivec atom gridline indices. Only used for host-side gather, or unit tests */
 272     int    *h_gridlineIndices;
 273 };
 274
 275 /*! \internal \brief
 276  * The PME GPU structure for all the data copied directly from the CPU PME structure.
 277  * The copying is done when the CPU PME structure is already (re-)initialized
 278  * (pme_gpu_reinit is called at the end of gmx_pme_init).
 279  * All the variables here are named almost the same way as in gmx_pme_t.
 280  * The types are different: pointers are replaced by vectors.
 281  * TODO: share this data with the PME CPU structure instead of copying it.
 282  * Included in the main PME GPU structure by value.
 283  */
 284 struct pme_shared_t
 285 {
 286     /*! \brief Grid count - currently always 1 on GPU */
 287     int ngrids;
 288     /*! \brief Grid dimensions - nkx, nky, nkz */
 289     int nk[DIM];
 290     /*! \brief Padded grid dimensions - pmegrid_nx, pmegrid_ny, pmegrid_nz
 291      * TODO: find out if these are really needed for the CPU FFT compatibility.
 292      */
 293     int               pmegrid_n[DIM];
 294     /*! \brief PME interpolation order */
 295     int               pme_order;
 296     /*! \brief Ewald splitting coefficient for Coulomb */
 297     real              ewaldcoeff_q;
 298     /*! \brief Electrostatics parameter */
 299     real              epsilon_r;
 300     /*! \brief Gridline indices - nnx, nny, nnz (one vector per dimension) */
 301     std::vector<int>  nn[DIM];
 302     /*! \brief Fractional shifts - fshx, fshy, fshz (one vector per dimension) */
 303     std::vector<real> fsh[DIM];
 304     /*! \brief Precomputed B-spline values (one vector per dimension) */
 305     std::vector<real> bsp_mod[DIM];
 306     /*! \brief The PME codepath being taken */
 307     PmeRunMode        runMode;
 308 };
 309
 310 /*! \internal \brief
 311  * The main PME GPU host structure, included in the PME CPU structure by pointer.
 312  */
 313 struct pme_gpu_t
 314 {
 315     /*! \brief The information copied once per reinit from the CPU structure. */
 316     std::shared_ptr<pme_shared_t> common; // TODO: make the CPU structure use the same type
 317
 318     /*! \brief The settings. */
 319     pme_gpu_settings_t settings;
 320
 321     /*! \brief The host-side buffers.
 322      * The device-side buffers are buried in kernelParams, but that will have to change.
 323      */
 324     pme_gpu_staging_t staging;
 325
 326     /*! \brief Number of local atoms, padded to be divisible by PME_ATOM_DATA_ALIGNMENT.
 327      * Used for kernel scheduling.
 328      * kernelParams.atoms.nAtoms is the actual atom count to be used for data copying.
 329      * TODO: this and the next member represent memory allocation/padding properties -
 330      * something a container type should ideally handle.
 331      */
 332     int nAtomsPadded;
 333     /*! \brief Number of local atoms, padded to be divisible by PME_ATOM_DATA_ALIGNMENT
 334      * if c_usePadding is true.
 335      * Used only as a basic size for almost all the atom data allocations
 336      * (spline parameter data is also aligned by PME_SPREADGATHER_PARTICLES_PER_WARP).
 337      * This should be the same as (c_usePadding ? nAtomsPadded : kernelParams.atoms.nAtoms).
 338      * kernelParams.atoms.nAtoms is the actual atom count to be used for most data copying.
 339      */
 340     int nAtomsAlloc;
 341
 342     /*! \brief A pointer to the device used during the execution. */
 343     gmx_device_info_t *deviceInfo;
 344
 345     /*! \brief A single structure encompassing all the PME data used on GPU.
 346      * Its value is the only argument to all the PME GPU kernels.
 347      * \todo Test whether this should be copied to the constant GPU memory once per MD step
 348      * (or even less often with no box updates) instead of being an argument.
 349      */
 350     std::shared_ptr<pme_gpu_kernel_params_t> kernelParams;
 351
 352     /*! \brief The pointer to GPU-framework specific host-side data, such as CUDA streams and events. */
 353     std::shared_ptr<pme_gpu_specific_t> archSpecific; /* FIXME: make it a unique_ptr */
 354 };
 355
 356 #endif