/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2012, The GROMACS development team.
 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief
 * Data types used internally in the nbnxn_cuda module.
 *
 * \author Szilárd Páll <pall.szilard@gmail.com>
 * \ingroup module_nbnxm
 */
#ifndef NBNXM_CUDA_TYPES_H
#define NBNXM_CUDA_TYPES_H

#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/devicebuffer.h"
#include "gromacs/gpu_utils/gputraits.cuh"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/nbnxm/gpu_types_common.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/nbnxm/pairlist.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/enumerationhelpers.h"
/*! \brief Macro defining the default for the prune kernel's j4 processing concurrency.
 *
 * The GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro allows compile-time override.
 */
#ifndef GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY
#    define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY 4
#endif
/*! \brief Default for the prune kernel's j4 processing concurrency.
 *
 * Initialized using the #GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro which allows compile-time override.
 */
const int c_cudaPruneKernelJ4Concurrency = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY;
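
/* The default above can thus be overridden when configuring the build; e.g.,
 * passing something like the following (hypothetical invocation) in the CUDA
 * compiler flags selects a concurrency of 2 instead of 4:
 *
 *   nvcc ... -DGMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY=2 ...
 */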
/* TODO: consider moving this to kernel_utils */
/* Convenience defines */
/*! \brief Number of clusters per supercluster. */
static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
/*! \brief Cluster size = number of atoms per cluster. */
static const int c_clSize = c_nbnxnGpuClusterSize;
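
/* For example, with the usual GPU geometry of 8 clusters of 8 atoms each,
 * a supercluster holds 8 * 8 = 64 atoms. */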
/*! \brief Electrostatic CUDA kernel flavors.
 *
 * Types of electrostatics implementations available in the CUDA non-bonded
 * force kernels. These represent both the electrostatics types implemented
 * by the kernels (cut-off, RF, and Ewald - a subset of what's defined in
 * enums.h) as well as encode implementation details: analytical/tabulated
 * and single or twin cut-off (for Ewald kernels).
 * Note that the cut-off and RF kernels have only an analytical flavor and,
 * unlike in the CPU kernels, the tabulated kernels are at the moment Ewald-only.
 *
 * The row-order of pointers to different electrostatic kernels defined in
 * nbnxn_cuda.cu by the nb_*_kfunc_ptr function pointer table
 * should match the order of enumerated types below.
 */
enum eelCu
{
    eelCuCUT, eelCuRF, eelCuEWALD_TAB, eelCuEWALD_TAB_TWIN, eelCuEWALD_ANA, eelCuEWALD_ANA_TWIN, eelCuNR
};
/*! \brief VdW CUDA kernel flavors.
 *
 * The enumerated values correspond to the LJ implementations in the CUDA non-bonded
 * force kernels.
 *
 * The column-order of pointers to different electrostatic kernels defined in
 * nbnxn_cuda.cu by the nb_*_kfunc_ptr function pointer table
 * should match the order of enumerated types below.
 */
enum evdwCu
{
    evdwCuCUT, evdwCuCUTCOMBGEOM, evdwCuCUTCOMBLB, evdwCuFSWITCH, evdwCuPSWITCH, evdwCuEWALDGEOM, evdwCuEWALDLB, evdwCuNR
};
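
/* Since the two enums above index the rows and columns of the kernel function
 * pointer tables in nbnxn_cuda.cu, picking a kernel flavor reduces to a 2D
 * table lookup. A minimal sketch (the table and function-pointer type names
 * are illustrative, not the actual declarations):
 * \code
 * nbnxn_cu_kfunc_ptr_t kernel = nb_kfunc_ptr[nbp->eeltype][nbp->vdwtype];
 * \endcode
 */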
/* All structs prefixed with "cu_" hold data used in GPU calculations and
 * are passed to the kernels, except cu_timers_t. */
typedef struct cu_atomdata cu_atomdata_t;
typedef struct cu_nbparam cu_nbparam_t;
typedef struct nb_staging nb_staging_t;
/*! \internal
 * \brief Staging area for temporary data downloaded from the GPU.
 *
 * The energies/shift forces get downloaded here first, before getting added
 * to the CPU-side aggregate values.
 */
struct nb_staging
{
    float  *e_lj;   /**< LJ energy            */
    float  *e_el;   /**< electrostatic energy */
    float3 *fshift; /**< shift forces         */
};
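
/* A minimal sketch of the intended staging flow (the device pointer and
 * stream names are hypothetical): values are copied into the staging buffers
 * asynchronously and only added to the CPU-side totals once the transfer has
 * completed.
 * \code
 * cudaMemcpyAsync(nbst.e_lj, d_e_lj, sizeof(float), cudaMemcpyDeviceToHost, stream);
 * cudaStreamSynchronize(stream);
 * *totalEnergyLJ += *nbst.e_lj;
 * \endcode
 */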
/*! \internal
 * \brief Nonbonded atom data - both inputs and outputs.
 */
struct cu_atomdata
{
    int natoms;       /**< number of atoms                            */
    int natoms_local; /**< number of local atoms                      */
    int nalloc;       /**< allocation size for the atom data (xq, f)  */

    float4 *xq; /**< atom coordinates + charges, size natoms    */
    float3 *f;  /**< force output array, size natoms            */

    float *e_lj; /**< LJ energy output, size 1                   */
    float *e_el; /**< Electrostatics energy output, size 1       */

    float3 *fshift; /**< shift forces                               */

    int     ntypes;     /**< number of atom types                       */
    int    *atom_types; /**< atom type indices, size natoms             */
    float2 *lj_comb;    /**< sqrt(c6),sqrt(c12), size natoms            */

    float3 *shift_vec;         /**< shifts                                     */
    bool    bShiftVecUploaded; /**< true if the shift vector has been uploaded */
};
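
/* The packed coordinate+charge layout lets a kernel fetch an atom's position
 * and charge with a single 16-byte load; a minimal sketch (the atom index ai
 * is illustrative):
 * \code
 * float4 xqbuf = atdat->xq[ai]; // .x/.y/.z hold the coordinates
 * float  qi    = xqbuf.w;       // .w holds the charge
 * \endcode
 */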
/*! \internal
 * \brief Parameters required for the CUDA nonbonded calculations.
 */
struct cu_nbparam
{
    int eeltype; /**< type of electrostatics, takes values from #eelCu */
    int vdwtype; /**< type of VdW impl., takes values from #evdwCu     */

    float epsfac;        /**< charge multiplication factor                      */
    float c_rf;          /**< Reaction-field/plain cutoff electrostatics const. */
    float two_k_rf;      /**< Reaction-field electrostatics constant            */
    float ewald_beta;    /**< Ewald/PME parameter                               */
    float sh_ewald;      /**< Ewald/PME correction term subtracted from the direct-space potential */
    float sh_lj_ewald;   /**< LJ-Ewald/PME correction term added to the correction potential       */
    float ewaldcoeff_lj; /**< LJ-Ewald/PME coefficient                          */

    float rcoulomb_sq; /**< Coulomb cut-off squared */

    float rvdw_sq;           /**< VdW cut-off squared                             */
    float rvdw_switch;       /**< VdW switched cut-off                            */
    float rlistOuter_sq;     /**< Full, outer pair-list cut-off squared           */
    float rlistInner_sq;     /**< Inner, dynamic pruned pair-list cut-off squared */
    bool  useDynamicPruning; /**< True if we use dynamic pair-list pruning        */

    shift_consts_t  dispersion_shift; /**< VdW shift dispersion constants */
    shift_consts_t  repulsion_shift;  /**< VdW shift repulsion constants  */
    switch_consts_t vdw_switch;       /**< VdW switch constants           */

    /* LJ non-bonded parameters - accessed through texture memory */
    float              *nbfp;             /**< nonbonded parameter table with C6/C12 pairs per atom type-pair, 2*ntype^2 elements */
    cudaTextureObject_t nbfp_texobj;      /**< texture object bound to nbfp      */
    float              *nbfp_comb;        /**< nonbonded parameter table per atom type, 2*ntype elements */
    cudaTextureObject_t nbfp_comb_texobj; /**< texture object bound to nbfp_comb */

    /* Ewald Coulomb force table data - accessed through texture memory */
    float               coulomb_tab_scale;  /**< table scale/spacing                 */
    float              *coulomb_tab;        /**< pointer to the table in device memory */
    cudaTextureObject_t coulomb_tab_texobj; /**< texture object bound to coulomb_tab */
};
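
/* A minimal sketch of how the Ewald force table is read in a kernel through
 * its texture object (the distance r and the index computation are
 * illustrative):
 * \code
 * int   index = static_cast<int>(r * nbparam.coulomb_tab_scale);
 * float F     = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index);
 * \endcode
 */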
/*! \internal
 * \brief Pair list data.
 */
using cu_plist_t = Nbnxm::gpu_plist;
/*! \internal
 * \brief Typedef of actual timer type.
 */
typedef struct Nbnxm::gpu_timers_t cu_timers_t;

class GpuEventSynchronizer;
/*! \internal
 * \brief Main data structure for CUDA nonbonded force calculations.
 */
struct gmx_nbnxn_cuda_t
{
    //! CUDA device information
    const gmx_device_info_t *dev_info;
    //! true if doing both local/non-local NB work on GPU
    bool bUseTwoStreams;
    //! atom data
    cu_atomdata_t *atdat;
    //! coordinates in rvec format
    rvec *xrvec;
    //! number of atoms
    int natoms;
    //! number of atoms allocated in device buffer
    int natoms_alloc;
    //! force in rvec format
    rvec *frvec;
    //! number of atoms in force buffer
    int nfrvec;
    //! number of atoms allocated in force buffer
    int nfrvec_alloc;
    //! f buf ops cell index mapping
    int *cell;
    //! number of indices in cell buffer
    int ncell;
    //! number of indices allocated in cell buffer
    int ncell_alloc;
    //! array of atom indices
    int *atomIndices;
    //! size of atom indices
    int atomIndicesSize;
    //! size of atom indices allocated in device buffer
    int atomIndicesSize_alloc;
    //! x buf ops num of atoms
    int *cxy_na;
    //! number of elements in cxy_na
    int ncxy_na;
    //! number of elements allocated in device buffer
    int ncxy_na_alloc;
    //! x buf ops cell index mapping
    int *cxy_ind;
    //! number of elements in cxy_ind
    int ncxy_ind;
    //! number of elements allocated in device buffer
    int ncxy_ind_alloc;
    //! parameters required for the non-bonded calc.
    cu_nbparam_t *nbparam;
    //! pair-list data structures (local and non-local)
    gmx::EnumerationArray<Nbnxm::InteractionLocality, cu_plist_t *> plist;
    //! staging area where fshift/energies get downloaded
    nb_staging_t nbst;
    //! local and non-local GPU streams
    gmx::EnumerationArray<Nbnxm::InteractionLocality, cudaStream_t> stream;
    /** events used for synchronization */
    cudaEvent_t nonlocal_done; /**< event triggered when the non-local non-bonded
                                    kernel is done (and the local transfer can proceed) */
    cudaEvent_t misc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in
                                                  the local stream that need to precede the
                                                  non-local force or buffer operation calculations
                                                  are done (e.g. f buffer zeroing, local x/q H2D,
                                                  buffer op initialization in the local stream that
                                                  is required also by the non-local stream) */
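
    /* A minimal sketch of the synchronization pattern these events support
     * (locality enum values abbreviated; the surrounding code is illustrative):
     * \code
     * cudaEventRecord(nb->nonlocal_done, nb->stream[NonLocal]);
     * cudaStreamWaitEvent(nb->stream[Local], nb->nonlocal_done, 0);
     * \endcode
     */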
    //! True if there has been local/nonlocal GPU work, either bonded or nonbonded, scheduled
    //! to be executed in the current domain. As long as bonded work is not split up into
    //! local/nonlocal, if there is bonded GPU work, both flags will be true.
    gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> haveWork;
    /*! \brief Event triggered when the coordinate buffer has been copied to the
     * device by the PP task, so that any dependent task (e.g. transfer of
     * coordinates to the PME rank's GPU) can proceed. */
    GpuEventSynchronizer *xAvailableOnDevice;
    /*! \brief Event triggered when the non-local coordinate buffer has been
     * copied from the device to the host. */
    GpuEventSynchronizer *xNonLocalCopyD2HDone;
    /* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple
     * concurrent streams, so we won't time if both local and non-local work is
     * done on GPUs. Timer init/uninit is still done even with timing off so only
     * the condition setting bDoTime needs to be changed if this CUDA "feature"
     * gets fixed. */
    //! True if event-based timing is enabled.
    bool bDoTime;
    //! CUDA event-based timers.
    cu_timers_t *timers;
    //! Timing data. TODO: deprecate this and query timers for accumulated data instead
    gmx_wallclock_gpu_nbnxn_t *timings;
};
#endif /* NBNXM_CUDA_TYPES_H */