src/gromacs/ewald/pme_gpu_types.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  * \brief Defines the PME GPU data structures
  38  * (the GPU function parameters used both on host and device sides).
  39  *
  40  * \author Aleksei Iupinov <a.yupinov@gmail.com>
  41  * \ingroup module_ewald
  42  */
  43
  44 #ifndef GMX_EWALD_PME_GPU_TYPES_H
  45 #define GMX_EWALD_PME_GPU_TYPES_H
  46
  47 /*
  48  * In OpenCL, the structures must be laid out on the host and device exactly the same way.
  49  * If something is off, one might get a CL_INVALID_ARG_SIZE error if a structure's size doesn't
  50  * match. What's worse, structures might be of the same size but have members aligned differently,
  51  * resulting in wrong kernel results. The structures below are aligned manually.
  52  * The pattern is ordering the members of structs from smallest to largest sizeof
  53  * (arrays behave the same way as sequences of separate fields),
  54  * as described in "The Lost Art of C Structure Packing".
  55  *
  56  * However, if the need arises at some point, they can all be aligned forcefully:
  57  *
  58  * #define GMX_GPU_ALIGNED __attribute__ ((aligned(8)))
  59  * struct GMX_GPU_ALIGNED PmeGpuConstParams
  60  * struct GMX_GPU_ALIGNED PmeGpuGridParams
  61  * etc...
  62  *
  63  * One might also try __attribute__ ((packed)), but it doesn't work with DeviceBuffer,
  64  * as it appears to not be POD.
  65  */
  66
  67
  68 /*! \brief A workaround to hide the DeviceBuffer template from OpenCL kernel compilation:
  69  * for OpenCL C the wrapped type becomes a char8 dummy of the same size as the host-side device buffer.
  70  * As we only care about 64-bit, 8 bytes is fine (the static_asserts below check the float and int buffers).
  71  * TODO: what we should be doing is providing separate device-side views of the same structures -
  72  * then there would be no need for the macro.
  73  */
  74 #ifndef __OPENCL_C_VERSION__
  75 #    include "gromacs/gpu_utils/devicebuffer.h"
  76 #    define HIDE_FROM_OPENCL_COMPILER(x) x
  77 static_assert(sizeof(DeviceBuffer<float>) == 8,
  78               "DeviceBuffer is defined as an 8 byte stub for OpenCL C");
  79 static_assert(sizeof(DeviceBuffer<int>) == 8,
  80               "DeviceBuffer is defined as an 8 byte stub for OpenCL C");
  81 #else /* compiling as OpenCL C: the real type is replaced by an 8-byte stub */
  82 #    define HIDE_FROM_OPENCL_COMPILER(x) char8
  83 #endif /* __OPENCL_C_VERSION__ */
  84
  85 #ifndef NUMFEPSTATES
  86 //! Number of FEP states (size of the per-state buffer arrays below); only defined here if not already defined.
  87 #    define NUMFEPSTATES 2
  88 #endif
  89
  90 /* What follows is all the PME GPU function arguments,
  91  * sorted into several device-side structures depending on the update rate.
  92  * This is GPU-agnostic (float3 replaced by float[3], etc.).
  93  * The GPU-framework specifics (e.g. cudaTextureObject_t handles) are described
  94  * in the larger structure PmeGpuCudaKernelParams in pme.cuh.
  95  */
  96
  97 /*! \internal \brief
  98  * A GPU data structure for storing the constant PME data.
  99  * This only has to be initialized once. Member order matters: the layout must match on host and device.
 100  */
 101 struct PmeGpuConstParams
 102 {
 103     /*! \brief Electrostatics coefficient = c_one4PiEps0 / pme->epsilon_r */
 104     float elFactor;
 105     /*! \brief Virial and energy GPU array handles (NUMFEPSTATES entries). Size is c_virialAndEnergyCount (7) floats.
 106      * The element order is virxx, viryy, virzz, virxy, virxz, viryz, energy. */
 107     HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_virialAndEnergy[NUMFEPSTATES];
 108 };
 109
 110 /*! \internal \brief
 111  * A GPU data structure for storing the PME data related to the grid sizes and cut-off.
 112  * This only has to be updated at every DD step. Member order matters: the layout must match on host and device.
 113  */
 114 struct PmeGpuGridParams
 115 {
 116     /*! \brief Ewald solving factor = (M_PI / pme->ewaldcoeff_q)^2 */
 117     float ewaldFactor;
 118
 119     /* Grid sizes (each array has one entry per dimension, DIM entries in total) */
 120     /*! \brief Real-space grid data dimensions. */
 121     int realGridSize[DIM];
 122     /*! \brief Real-space grid dimensions, the same values as realGridSize but converted to floating point. */
 123     float realGridSizeFP[DIM];
 124     /*! \brief Real-space grid dimensions (padded). The padding as compared to realGridSize includes the (order - 1) overlap. */
 125     int realGridSizePadded[DIM]; /* TODO: is the major dimension of this ever used in kernels? */
 126     /*! \brief Fourier grid dimensions. This counts the complex numbers! */
 127     int complexGridSize[DIM];
 128     /*! \brief Fourier grid dimensions (padded). This counts the complex numbers! */
 129     int complexGridSizePadded[DIM];
 130
 131     /*! \brief Offsets for X/Y/Z components of d_splineModuli */
 132     int splineValuesOffset[DIM];
 133     /*! \brief Offsets for X/Y/Z components of d_fractShiftsTable and d_gridlineIndicesTable */
 134     int tablesOffsets[DIM];
 135
 136     /* Grid arrays (device buffer handles; the [NUMFEPSTATES] members hold NUMFEPSTATES handles each) */
 137     /*! \brief Real-space grid. */
 138     HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_realGrid[NUMFEPSTATES];
 139     /*! \brief Complex grid - used in FFT/solve. If in-place cu/clFFT is used, then it is the same handle as d_realGrid. */
 140     HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_fourierGrid[NUMFEPSTATES];
 141
 142     /*! \brief Grid spline values as in pme->bsp_mod
 143      * (laid out sequentially (XXX....XYYY......YZZZ.....Z); see splineValuesOffset for the per-dimension offsets)
 144      */
 145     HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_splineModuli[NUMFEPSTATES];
 146     /*! \brief Fractional shifts lookup table as in pme->fshx/fshy/fshz, laid out sequentially (XXX....XYYY......YZZZ.....Z) */
 147     HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_fractShiftsTable;
 148     /*! \brief Gridline indices lookup table
 149      * (modulo lookup table as in pme->nnx/nny/nnz, laid out sequentially (XXX....XYYY......YZZZ.....Z)) */
 150     HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<int>) d_gridlineIndicesTable;
 151 };
 152
 153 /*! \internal \brief
 154  * A GPU data structure for storing the PME data of the atoms, local to this process' domain
 155  * partition. This only has to be updated at every DD step.
 156  */
 157 struct PmeGpuAtomParams
 158 {
 159     /*! \brief Number of local atoms */
 160     int nAtoms;
 161     /*! \brief Global GPU memory array handle with input rvec atom coordinates.
 162      * The coordinates themselves change and need to be copied to the GPU for every PME computation,
 163      * but reallocation happens only at DD. NOTE(review): no static_assert in this file covers sizeof(DeviceBuffer<gmx::RVec>) - confirm it is 8.
 164      */
 165     HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<gmx::RVec>) d_coordinates;
 166     /*! \brief Global GPU memory array handles with input atom charges in states A and B (NUMFEPSTATES entries).
 167      * The charges only need to be reallocated and copied to the GPU at a DD step.
 168      */
 169     HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_coefficients[NUMFEPSTATES];
 170     /*! \brief Global GPU memory array handle with input/output rvec atom forces.
 171      * The forces change and need to be copied from (and possibly to) the GPU for every PME
 172      * computation, but reallocation happens only at DD.
 173      */
 174     HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<gmx::RVec>) d_forces;
 175     /*! \brief Global GPU memory array handle with ivec atom gridline indices.
 176      * Computed on the GPU in the spline calculation part.
 177      */
 178     HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<int>) d_gridlineIndices;
 179     /* B-spline parameters are computed entirely on the GPU for every PME computation, not copied.
 180      * Unless we want to try something like GPU spread + CPU gather?
 181      */
 182     /*! \brief Global GPU memory array handle with B-spline values */
 183     HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_theta;
 184     /*! \brief Global GPU memory array handle with B-spline derivative values */
 185     HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_dtheta;
 186 };
 187
 188 /*! \internal \brief
 189  * A GPU data structure for storing the PME data which might change for each new PME computation.
 190  */
 191 struct PmeGpuDynamicParams
 192 {
 193     /* The box parameters. The box only changes size with pressure coupling enabled. */
 194     /*! \brief
 195      * Reciprocal (inverted unit cell) box.
 196      *
 197      * The box is transposed as compared to the CPU pme->recipbox.
 198      * Basically, spread uses matrix columns (while solve and gather use rows).
 199      * This storage format might not be optimal: the box is always triangular, so some of the DIM x DIM entries are zeroes.
 200      */
 201     float recipBox[DIM][DIM];
 202     /*! \brief The unit cell volume for solving. */
 203     float boxVolume;
 204
 205     /*! \brief The current coefficient scaling value. */
 206     float scale;
 207 };
 208
 209 /*! \internal \brief
 210  * A single structure encompassing all the PME data used in GPU kernels on the device.
 211  * To extend the list with platform-specific parameters, this can be inherited by the
 212  * GPU framework-specific structure.
 213  */
 214 struct PmeGpuKernelParamsBase
 215 {
 216     /*! \brief Constant data that is set once. */
 217     struct PmeGpuConstParams constants;
 218     /*! \brief Data dependent on the grid size/cutoff. */
 219     struct PmeGpuGridParams grid;
 220     /*! \brief Data dependent on the DD and local atoms. */
 221     struct PmeGpuAtomParams atoms;
 222     /*! \brief Data that possibly changes for every new PME computation.
 223      * This should be kept up-to-date by calling pme_gpu_prepare_computation(...)
 224      * before launching spreading.
 225      */
 226     struct PmeGpuDynamicParams current;
 227
 228     /*! \brief Whether pipelining with PP communications is active.
 229      * Stored as char rather than bool to avoid problems with the OpenCL compiler. */
 230     char usePipeline;
 231     /*! \brief Start atom for this stage of the pipeline */
 232     int pipelineAtomStart;
 233     /*! \brief End atom for this stage of the pipeline. NOTE(review): inclusive or exclusive? Confirm against the kernels. */
 234     int pipelineAtomEnd;
 235
 236     /* These texture objects are only used in CUDA and are related to the grid size. NOTE(review): for OpenCL C they become char8 stubs, but no static_assert in this file checks sizeof(DeviceTexture) - confirm it is 8. */
 237     /*! \brief Texture object for accessing grid.d_fractShiftsTable */
 238     HIDE_FROM_OPENCL_COMPILER(DeviceTexture) fractShiftsTableTexture;
 239     /*! \brief Texture object for accessing grid.d_gridlineIndicesTable */
 240     HIDE_FROM_OPENCL_COMPILER(DeviceTexture) gridlineIndicesTableTexture;
 241 };
 242
 243 #endif