src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2012, The GROMACS development team.
   6  * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37
  38 /*! \internal \file
  39  *  \brief
  40  *  Data types used internally in the nbnxn_cuda module.
  41  *
  42  *  \author Szilárd Páll <pall.szilard@gmail.com>
  43  *  \ingroup module_nbnxm
  44  */
  45
  46 #ifndef NBNXM_CUDA_TYPES_H
  47 #define NBNXM_CUDA_TYPES_H
  48
  49 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
  50 #include "gromacs/gpu_utils/cudautils.cuh"
  51 #include "gromacs/gpu_utils/devicebuffer.h"
  52 #include "gromacs/gpu_utils/gputraits.cuh"
  53 #include "gromacs/mdtypes/interaction_const.h"
  54 #include "gromacs/nbnxm/gpu_types_common.h"
  55 #include "gromacs/nbnxm/nbnxm.h"
  56 #include "gromacs/nbnxm/pairlist.h"
  57 #include "gromacs/timing/gpu_timing.h"
  58 #include "gromacs/utility/enumerationhelpers.h"
  59
  60 /*! \brief Macro definining default for the prune kernel's j4 processing concurrency.
  61  *
  62  *  The GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro allows compile-time override.
  63  */
  64 #ifndef GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY
  65 #define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY 4
  66 #endif
  67 /*! \brief Default for the prune kernel's j4 processing concurrency.
  68  *
  69  *  Initialized using the #GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro which allows compile-time override.
  70  */
  71 const int c_cudaPruneKernelJ4Concurrency = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY;
  72
  73 /* TODO: consider moving this to kernel_utils */
  74 /* Convenience defines */
  75 /*! \brief number of clusters per supercluster. */
  76 static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
  77 /*! \brief cluster size = number of atoms per cluster. */
  78 static const int c_clSize          = c_nbnxnGpuClusterSize;
  79
  80 /*! \brief Electrostatic CUDA kernel flavors.
  81  *
  82  *  Types of electrostatics implementations available in the CUDA non-bonded
  83  *  force kernels. These represent both the electrostatics types implemented
  84  *  by the kernels (cut-off, RF, and Ewald - a subset of what's defined in
  85  *  enums.h) as well as encode implementation details analytical/tabulated
  86  *  and single or twin cut-off (for Ewald kernels).
  87  *  Note that the cut-off and RF kernels have only analytical flavor and unlike
  88  *  in the CPU kernels, the tabulated kernels are ATM Ewald-only.
  89  *
  90  *  The row-order of pointers to different electrostatic kernels defined in
  91  *  nbnxn_cuda.cu by the nb_*_kfunc_ptr function pointer table
  92  *  should match the order of enumerated types below.
  93  */
  94 enum eelCu {
  95     eelCuCUT, eelCuRF, eelCuEWALD_TAB, eelCuEWALD_TAB_TWIN, eelCuEWALD_ANA, eelCuEWALD_ANA_TWIN, eelCuNR
  96 };
  97
  98 /*! \brief VdW CUDA kernel flavors.
  99  *
 100  * The enumerates values correspond to the LJ implementations in the CUDA non-bonded
 101  * kernels.
 102  *
 103  * The column-order of pointers to different electrostatic kernels defined in
 104  * nbnxn_cuda.cu by the nb_*_kfunc_ptr function pointer table
 105  * should match the order of enumerated types below.
 106  */
 107 enum evdwCu {
 108     evdwCuCUT, evdwCuCUTCOMBGEOM, evdwCuCUTCOMBLB, evdwCuFSWITCH, evdwCuPSWITCH, evdwCuEWALDGEOM, evdwCuEWALDLB, evdwCuNR
 109 };
 110
 111 /* All structs prefixed with "cu_" hold data used in GPU calculations and
 112  * are passed to the kernels, except cu_timers_t. */
 113 /*! \cond */
 114 typedef struct cu_atomdata  cu_atomdata_t;
 115 typedef struct cu_nbparam   cu_nbparam_t;
 116 typedef struct nb_staging   nb_staging_t;
 117 /*! \endcond */
 118
 119
 120 /** \internal
 121  * \brief Staging area for temporary data downloaded from the GPU.
 122  *
 123  *  The energies/shift forces get downloaded here first, before getting added
 124  *  to the CPU-side aggregate values.
 125  */
 126 struct nb_staging
 127 {
 128     float   *e_lj;      /**< LJ energy            */
 129     float   *e_el;      /**< electrostatic energy */
 130     float3  *fshift;    /**< shift forces         */
 131 };
 132
 133 /** \internal
 134  * \brief Nonbonded atom data - both inputs and outputs.
 135  */
 136 struct cu_atomdata
 137 {
 138     int      natoms;            /**< number of atoms                              */
 139     int      natoms_local;      /**< number of local atoms                        */
 140     int      nalloc;            /**< allocation size for the atom data (xq, f)    */
 141
 142     float4  *xq;                /**< atom coordinates + charges, size natoms      */
 143     float3  *f;                 /**< force output array, size natoms              */
 144
 145     float   *e_lj;              /**< LJ energy output, size 1                     */
 146     float   *e_el;              /**< Electrostatics energy input, size 1          */
 147
 148     float3  *fshift;            /**< shift forces                                 */
 149
 150     int      ntypes;            /**< number of atom types                         */
 151     int     *atom_types;        /**< atom type indices, size natoms               */
 152     float2  *lj_comb;           /**< sqrt(c6),sqrt(c12) size natoms               */
 153
 154     float3  *shift_vec;         /**< shifts                                       */
 155     bool     bShiftVecUploaded; /**< true if the shift vector has been uploaded   */
 156 };
 157
 158 /** \internal
 159  * \brief Parameters required for the CUDA nonbonded calculations.
 160  */
 161 struct cu_nbparam
 162 {
 163
 164     int             eeltype;              /**< type of electrostatics, takes values from #eelCu */
 165     int             vdwtype;              /**< type of VdW impl., takes values from #evdwCu     */
 166
 167     float           epsfac;               /**< charge multiplication factor                      */
 168     float           c_rf;                 /**< Reaction-field/plain cutoff electrostatics const. */
 169     float           two_k_rf;             /**< Reaction-field electrostatics constant            */
 170     float           ewald_beta;           /**< Ewald/PME parameter                               */
 171     float           sh_ewald;             /**< Ewald/PME correction term substracted from the direct-space potential */
 172     float           sh_lj_ewald;          /**< LJ-Ewald/PME correction term added to the correction potential        */
 173     float           ewaldcoeff_lj;        /**< LJ-Ewald/PME coefficient                          */
 174
 175     float           rcoulomb_sq;          /**< Coulomb cut-off squared                           */
 176
 177     float           rvdw_sq;              /**< VdW cut-off squared                               */
 178     float           rvdw_switch;          /**< VdW switched cut-off                              */
 179     float           rlistOuter_sq;        /**< Full, outer pair-list cut-off squared             */
 180     float           rlistInner_sq;        /**< Inner, dynamic pruned pair-list cut-off squared   */
 181     bool            useDynamicPruning;    /**< True if we use dynamic pair-list pruning          */
 182
 183     shift_consts_t  dispersion_shift;     /**< VdW shift dispersion constants           */
 184     shift_consts_t  repulsion_shift;      /**< VdW shift repulsion constants            */
 185     switch_consts_t vdw_switch;           /**< VdW switch constants                     */
 186
 187     /* LJ non-bonded parameters - accessed through texture memory */
 188     float               *nbfp;             /**< nonbonded parameter table with C6/C12 pairs per atom type-pair, 2*ntype^2 elements */
 189     cudaTextureObject_t  nbfp_texobj;      /**< texture object bound to nbfp                                                       */
 190     float               *nbfp_comb;        /**< nonbonded parameter table per atom type, 2*ntype elements                          */
 191     cudaTextureObject_t  nbfp_comb_texobj; /**< texture object bound to nbfp_texobj                                                */
 192
 193     /* Ewald Coulomb force table data - accessed through texture memory */
 194     float                coulomb_tab_scale;  /**< table scale/spacing                        */
 195     float               *coulomb_tab;        /**< pointer to the table in the device memory  */
 196     cudaTextureObject_t  coulomb_tab_texobj; /**< texture object bound to coulomb_tab        */
 197 };
 198
 199 /** \internal
 200  * \brief Pair list data.
 201  */
 202 using cu_plist_t = Nbnxm::gpu_plist;
 203
 204 /** \internal
 205  * \brief Typedef of actual timer type.
 206  */
 207 typedef struct Nbnxm::gpu_timers_t cu_timers_t;
 208
 209 class GpuEventSynchronizer;
 210
 211 /** \internal
 212  * \brief Main data structure for CUDA nonbonded force calculations.
 213  */
 214 struct gmx_nbnxn_cuda_t
 215 {
 216     //! CUDA device information
 217     const gmx_device_info_t                                        *dev_info;
 218     //! true if doing both local/non-local NB work on GPU
 219     bool                                                            bUseTwoStreams;
 220     //! atom data
 221     cu_atomdata_t                                                  *atdat;
 222     //! coordinates in rvec format
 223     rvec                                                           *xrvec;
 224     //! number of atoms
 225     int                                                             natoms;
 226     //! number of atoms allocated in device buffer
 227     int                                                             natoms_alloc;
 228     //! force in rvec format
 229     rvec                                                           *frvec;
 230     //! number of atoms in force buffer
 231     int                                                             nfrvec;
 232     //! number of atoms allocated in force buffer
 233     int                                                             nfrvec_alloc;
 234     //! f buf ops cell index mapping
 235     int                                                            *cell;
 236     //! number of indices in cell buffer
 237     int                                                             ncell;
 238     //! number of indices allocated in cell buffer
 239     int                                                             ncell_alloc;
 240     //! array of atom indices
 241     int                                                            *atomIndices;
 242     //! size of atom indices
 243     int                                                             atomIndicesSize;
 244     //! size of atom indices allocated in device buffer
 245     int                                                             atomIndicesSize_alloc;
 246     //! x buf ops num of atoms
 247     int                                                            *cxy_na;
 248     //! number of elements in cxy_na
 249     int                                                             ncxy_na;
 250     //! number of elements allocated allocated in device buffer
 251     int                                                             ncxy_na_alloc;
 252     //! x buf ops cell index mapping
 253     int                                                            *cxy_ind;
 254     //! number of elements in cxy_ind
 255     int                                                             ncxy_ind;
 256     //! number of elements allocated allocated in device buffer
 257     int                                                             ncxy_ind_alloc;
 258     //! parameters required for the non-bonded calc.
 259     cu_nbparam_t                                                   *nbparam;
 260     //! pair-list data structures (local and non-local)
 261     gmx::EnumerationArray<Nbnxm::InteractionLocality, cu_plist_t *> plist;
 262     //! staging area where fshift/energies get downloaded
 263     nb_staging_t                                                    nbst;
 264     //! local and non-local GPU streams
 265     gmx::EnumerationArray<Nbnxm::InteractionLocality, cudaStream_t> stream;
 266
 267     /** events used for synchronization */
 268     cudaEvent_t    nonlocal_done;               /**< event triggered when the non-local non-bonded kernel
 269                                                    is done (and the local transfer can proceed)           */
 270     cudaEvent_t    misc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in
 271                                                    the local stream that need to precede the
 272                                                    non-local force or buffer operation calculations are done
 273                                                    (e.g. f buffer 0-ing, local x/q H2D, buffer op
 274                                                    initialization in local stream that is required also
 275                                                    by nonlocal stream ) */
 276
 277     //! True if there has been local/nonlocal GPU work, either bonded or nonbonded, scheduled
 278     //  to be executed in the current domain. As long as bonded work is not split up into
 279     //  local/nonlocal, if there is bonded GPU work, both flags will be true.
 280     gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> haveWork;
 281
 282
 283     GpuEventSynchronizer *xAvailableOnDevice;   /**< event triggered when
 284                                                    coordinate buffer has been
 285                                                    copied to device by PP task and
 286                                                    any dependent task (e.g. transfer of coordinates
 287                                                    to the PME rank's GPU) can proceed. */
 288
 289     GpuEventSynchronizer *xNonLocalCopyD2HDone; /**< event triggered when
 290                                                    non-local coordinate buffer has been
 291                                                    copied from device to host*/
 292
 293     /* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple
 294      * concurrent streams, so we won't time if both l/nl work is done on GPUs.
 295      * Timer init/uninit is still done even with timing off so only the condition
 296      * setting bDoTime needs to be change if this CUDA "feature" gets fixed. */
 297     //! True if event-based timing is enabled.
 298     bool                       bDoTime;
 299     //! CUDA event-based timers.
 300     cu_timers_t               *timers;
 301     //! Timing data. TODO: deprecate this and query timers for accumulated data instead
 302     gmx_wallclock_gpu_nbnxn_t *timings;
 303 };
 304
#endif /* NBNXM_CUDA_TYPES_H */