src/gromacs/ewald/pme_gpu_types_host.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \libinternal \file
  37  * \brief Defines the host-side PME GPU data structures.
  38  * \todo Some renaming/refactoring, which does not impair the performance:
  39  * -- bringing the function names up to guidelines
  40  * -- PmeGpuSettings -> PmeGpuTasks
  41  * -- refining GPU notation application (#2053)
  42  * -- renaming coefficients to charges (?)
  43  *
  44  * \author Aleksei Iupinov <a.yupinov@gmail.com>
  45  * \ingroup module_ewald
  46  */
  47
  48 #ifndef GMX_EWALD_PME_GPU_TYPES_HOST_H
  49 #define GMX_EWALD_PME_GPU_TYPES_HOST_H
  50
  51 #include "config.h"
  52
  53 #include <memory>
  54 #include <vector>
  55
  56 #include "gromacs/ewald/pme.h"
  57 #include "gromacs/ewald/pme_gpu_program.h"
  58 #include "gromacs/gpu_utils/clfftinitializer.h"
  59 #include "gromacs/gpu_utils/gpu_utils.h" // for GpuApiCallBehavior
  60 #include "gromacs/gpu_utils/hostallocator.h"
  61 #include "gromacs/math/vectypes.h"
  62
  63 #if GMX_GPU == GMX_GPU_NONE
  64 /*! \brief Placeholder alias that stands in for the GPU host-side data type on non-GPU builds */
  65 using PmeGpuSpecific = int;
  66 #else
  67 struct PmeGpuSpecific; // forward declaration only; this header does not define the type
  68 #endif
  69
  70 #if GMX_GPU == GMX_GPU_CUDA
  71 struct PmeGpuCudaKernelParams;
  72 /*! \brief Alias through which the CUDA kernel arguments structure is referred to by pointer */
  73 using PmeGpuKernelParams = PmeGpuCudaKernelParams;
  74 #elif GMX_GPU == GMX_GPU_OPENCL
  75 struct PmeGpuKernelParamsBase;
  76 /*! \brief Alias through which the OpenCL kernel arguments structure is referred to by pointer */
  77 using PmeGpuKernelParams = PmeGpuKernelParamsBase;
  78 #else
  79 /*! \brief Placeholder alias for the GPU kernel arguments type on non-GPU builds */
  80 using PmeGpuKernelParams = int;
  81 #endif
  82
  83 struct gmx_device_info_t; // forward declaration; this header only holds a pointer to it
  84
  85 /*! \internal \brief
  86  * The PME GPU settings structure, included in the main PME GPU structure (PmeGpu::settings) by value.
  87  */
  88 struct PmeGpuSettings
  89 {
  90     /* Settings fixed on initialization, except where a member's own comment says it may change later */
  91     /*! \brief True if the solving is performed on the GPU. Currently always true. */
  92     bool performGPUSolve;
  93     /*! \brief True if the gathering is performed on the GPU. Currently always true. */
  94     bool performGPUGather;
  95     /*! \brief True if the FFT is performed on the GPU. Currently true for a single MPI rank. */
  96     bool performGPUFFT;
  97     /*! \brief A convenience boolean which tells if PME decomposition is used. */
  98     bool useDecomposition;
  99     /*! \brief True if PME forces are reduced on the GPU, false if the reduction is done on the CPU;
 100      *  in the former case the transfer does not need to happen.
 101      *
 102      *  Note that, unlike the flags above, this one may change per-step.
 103      */
 104     bool useGpuForceReduction;
 105
 106     /*! \brief A boolean which tells if any PME GPU stage should copy all of its outputs to the
 107      * host. Only intended to be used by the test framework.
 108      */
 109     bool copyAllOutputs;
 110     /*! \brief Tells whether most PME GPU D2H/H2D data transfers should be synchronous. */
 111     GpuApiCallBehavior transferKind;
 112     /*! \brief Flags for the current PME computation; they correspond to the GMX_PME_ flags in pme.h. */
 113     int currentFlags;
 114     /*! \brief
 115      *  Currently only supported by CUDA.
 116      *  Selects between using order (i.e. 4) threads per atom on the GPU
 117      *  and order*order (i.e. 16) threads per atom.
 118      */
 119     bool useOrderThreadsPerAtom;
 120     /*! \brief
 121      * Currently only supported by CUDA.
 122      * Selects between recalculating the splines in the gather and
 123      * saving the values in the spread, then reloading them in the gather.
 124      */
 125     bool recalculateSplines;
 126 };
 127
 128 // TODO There's little value in computing the Coulomb and LJ virial
 129 // separately, so we should simplify that.
 130 // TODO The matrices might be best as a view, but not currently
 131 // possible. Use mdspan?
 132 struct PmeOutput
 133 {
 134     gmx::ArrayRef<gmx::RVec> forces_; //!< Host staging area for PME forces
 135     bool                     haveForceOutput_ =
 136             false; //!< True if forces have been staged other false (when forces are reduced on the GPU).
 137     real   coulombEnergy_ = 0;         //!< Host staging area for PME coulomb energy
 138     matrix coulombVirial_ = { { 0 } }; //!< Host staging area for PME coulomb virial contributions
 139     real   lennardJonesEnergy_ = 0;    //!< Host staging area for PME LJ energy
 140     matrix lennardJonesVirial_ = { { 0 } }; //!< Host staging area for PME LJ virial contributions
 141 };
 142
 143 /*! \internal \brief
 144  * The PME GPU intermediate buffers structure, included in the main PME GPU structure by value.
 145  * Buffers are managed by the PME GPU module; every raw pointer here is null until a buffer is assigned.
 146  */
 147 struct PmeGpuStaging
 148 {
 149     //! Host-side force buffer
 150     gmx::PaddedHostVector<gmx::RVec> h_forces;
 151
 152     /*! \brief Virial and energy intermediate host-side buffer. Size is PME_GPU_VIRIAL_AND_ENERGY_COUNT. */
 153     float* h_virialAndEnergy = nullptr;
 154     /*! \brief B-spline values (moduli, judging by the member name - TODO confirm) intermediate host-side buffer. */
 155     float* h_splineModuli = nullptr;
 156
 157     /*! \brief Pointer to the host memory with B-spline values. Only used for host-side gather, or unit tests */
 158     float* h_theta = nullptr;
 159     /*! \brief Pointer to the host memory with B-spline derivative values. Only used for host-side gather, or unit tests */
 160     float* h_dtheta = nullptr;
 161     /*! \brief Pointer to the host memory with ivec atom gridline indices. Only used for host-side gather, or unit tests */
 162     int* h_gridlineIndices = nullptr;
 163 };
 164
 165 /*! \internal \brief
 166  * The PME GPU structure for all the data copied directly from the CPU PME structure.
 167  * The copying is done when the CPU PME structure is already (re-)initialized
 168  * (pme_gpu_reinit is called at the end of gmx_pme_init).
 169  * All the variables here are named almost the same way as in gmx_pme_t.
 170  * The types are different: pointers are replaced by vectors.
 171  * TODO: use the shared data with the PME CPU.
 172  * Included in the main PME GPU structure by value.
 173  */
 174 struct PmeShared
 175 {
 176     /*! \brief Grid count - currently always 1 on GPU */
 177     int ngrids;
 178     /*! \brief Grid dimensions - nkx, nky, nkz */
 179     int nk[DIM];
 180     /*! \brief PME interpolation order */
 181     int pme_order;
 182     /*! \brief Ewald splitting coefficient for Coulomb */
 183     real ewaldcoeff_q;
 184     /*! \brief Electrostatics parameter */
 185     real epsilon_r;
 186     /*! \brief Gridline indices - nnx, nny, nnz */
 187     std::vector<int> nn;
 188     /*! \brief Fractional shifts - fshx, fshy, fshz */
 189     std::vector<real> fsh;
 190     /*! \brief Precomputed B-spline values */
 191     std::vector<real> bsp_mod[DIM];
 192     /*! \brief The PME codepath being taken */
 193     PmeRunMode runMode;
 194     /*! \brief  Whether PME execution is happening on a PME-only rank (from gmx_pme_t.bPPnode). */
 195     bool isRankPmeOnly;
 196     /*! \brief The box scaler based on inputrec - created in pme_init and managed by CPU structure */
 197     class EwaldBoxZScaler* boxScaler;
 198     /*! \brief The previous computation box to know if we even need to update the current box params.
 199      * \todo Manage this on higher level.
 200      * \todo Alternatively, when this structure is used by CPU PME code, make use of this field there as well.
 201      */
 202     matrix previousBox;
 203 };
 204
 205 /*! \internal \brief
 206  * The main PME GPU host structure, included in the PME CPU structure by pointer.
 207  */
 208 struct PmeGpu
 209 {
 210     /*! \brief The information copied once per reinit from the CPU structure. */
 211     std::shared_ptr<PmeShared> common; // TODO: make the CPU structure use the same type
 212
 213     //! A handle to the program created by buildPmeGpuProgram()
 214     PmeGpuProgramHandle programHandle_;
 215
 216     //! Handle that ensures the clFFT library has been initialized once per process.
 217     std::unique_ptr<gmx::ClfftInitializer> initializedClfftLibrary_;
 218
 219     /*! \brief The settings. */
 220     PmeGpuSettings settings;
 221
 222     /*! \brief The host-side buffers.
 223      * The device-side buffers are buried in kernelParams, but that will have to change.
 224      */
 225     PmeGpuStaging staging;
 226
 227     /*! \brief Number of local atoms, padded to be divisible by c_pmeAtomDataAlignment.
 228      * Used for kernel scheduling.
 229      * kernelParams.atoms.nAtoms is the actual atom count to be used for data copying.
 230      * TODO: this and the next member represent a memory allocation/padding properties -
 231      * what a container type should do ideally.
 232      */
 233     int nAtomsPadded;
 234     /*! \brief Number of local atoms, padded to be divisible by c_pmeAtomDataAlignment
 235      * if c_usePadding is true.
 236      * Used only as a basic size for almost all the atom data allocations
 237      * (spline parameter data is also aligned by PME_SPREADGATHER_PARTICLES_PER_WARP).
 238      * This should be the same as (c_usePadding ? nAtomsPadded : kernelParams.atoms.nAtoms).
 239      * kernelParams.atoms.nAtoms is the actual atom count to be used for most data copying.
 240      */
 241     int nAtomsAlloc;
 242
 243     /*! \brief A pointer to the device used during the execution. */
 244     const gmx_device_info_t* deviceInfo;
 245
 246     /*! \brief Kernel scheduling grid width limit in X - derived from deviceinfo compute capability in CUDA.
 247      * Declared as very large int to make it useful in computations with type promotion, to avoid overflows.
 248      * OpenCL seems to not have readily available global work size limit, so we just assign a large arbitrary constant to this instead.
 249      * TODO: this should be in PmeGpuProgram(Impl)
 250      */
 251     std::intmax_t maxGridWidthX;
 252
 253     /*! \brief A single structure encompassing all the PME data used on GPU.
 254      * Its value is the only argument to all the PME GPU kernels.
 255      * \todo Test whether this should be copied to the constant GPU memory once for each computation
 256      * (or even less often with no box updates) instead of being an argument.
 257      */
 258     std::shared_ptr<PmeGpuKernelParams> kernelParams;
 259
 260     /*! \brief The pointer to GPU-framework specific host-side data, such as CUDA streams and events. */
 261     std::shared_ptr<PmeGpuSpecific> archSpecific; /* FIXME: make it an unique_ptr */
 262 };
 263
 264 #endif