/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \file
 *  \brief Define CUDA implementation of nbnxn_gpu.h
 *
 *  \author Szilard Pall <pall.szilard@gmail.com>
 */
#include "gmxpre.h"

#include "config.h"

#include <assert.h>
#include <stdlib.h>

#include "gromacs/nbnxm/nbnxm_gpu.h"

#if defined(_MSVC)
#    include <limits>
#endif


#include "nbnxm_cuda.h"

#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/gpu_utils/gpueventsynchronizer.h"
#include "gromacs/gpu_utils/typecasts.cuh"
#include "gromacs/gpu_utils/vectype_ops.cuh"
#include "gromacs/hardware/device_information.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/nbnxm/atomdata.h"
#include "gromacs/nbnxm/gpu_common.h"
#include "gromacs/nbnxm/gpu_common_utils.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/grid.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/nbnxm/pairlist.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/gmxassert.h"

#include "nbnxm_cuda_types.h"

/***** The kernel declarations/definitions come here *****/

/* Top-level kernel declaration generation: will generate through multiple
 * inclusion the following flavors for all kernel declarations:
 * - force-only output;
 * - force and energy output;
 * - force-only with pair list pruning;
 * - force and energy output with pair list pruning.
 */
#define FUNCTION_DECLARATION_ONLY
/** Force only **/
#include "nbnxm_cuda_kernels.cuh"
/** Force & energy **/
#define CALC_ENERGIES
#include "nbnxm_cuda_kernels.cuh"
#undef CALC_ENERGIES

/*** Pair-list pruning kernels ***/
/** Force only **/
#define PRUNE_NBL
#include "nbnxm_cuda_kernels.cuh"
/** Force & energy **/
#define CALC_ENERGIES
#include "nbnxm_cuda_kernels.cuh"
#undef CALC_ENERGIES
#undef PRUNE_NBL

/* Prune-only kernels */
#include "nbnxm_cuda_kernel_pruneonly.cuh"
#undef FUNCTION_DECLARATION_ONLY

/* Now generate the function definitions if we are using a single compilation unit. */
#if GMX_CUDA_NB_SINGLE_COMPILATION_UNIT
#    include "nbnxm_cuda_kernel_F_noprune.cu"
#    include "nbnxm_cuda_kernel_F_prune.cu"
#    include "nbnxm_cuda_kernel_VF_noprune.cu"
#    include "nbnxm_cuda_kernel_VF_prune.cu"
#    include "nbnxm_cuda_kernel_pruneonly.cu"
#endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */

namespace Nbnxm
{

/*! Nonbonded kernel function pointer type */
typedef void (*nbnxn_cu_kfunc_ptr_t)(const NBAtomDataGpu, const NBParamGpu, const gpu_plist, bool);

/*********************************/

/*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
static inline int calc_nb_kernel_nblock(int nwork_units, const DeviceInformation* deviceInfo)
{
    int max_grid_x_size;

    assert(deviceInfo);
    /* CUDA does not accept grid dimension of 0 (which can happen e.g. with an
       empty domain) and that case should be handled before this point. */
    assert(nwork_units > 0);

    max_grid_x_size = deviceInfo->prop.maxGridSize[0];

    /* do we exceed the grid x dimension limit? */
    if (nwork_units > max_grid_x_size)
    {
        gmx_fatal(FARGS,
                  "Watch out, the input system is too large to simulate!\n"
                  "The number of nonbonded work units (=number of super-clusters) exceeds the "
                  "maximum grid size in x dimension (%d > %d)!",
                  nwork_units,
                  max_grid_x_size);
    }

    return nwork_units;
}


/* Constant arrays listing all kernel function pointers and enabling selection
   of a kernel in an elegant manner. */

/*! Pointers to the non-bonded kernels organized in 2-dim arrays by:
 *  electrostatics and VDW type.
 *
 *  Note that the row- and column-order of function pointers has to match the
 *  order of corresponding enumerated electrostatics and vdw types, resp.,
 *  defined in nbnxn_cuda_types.h.
 */
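/* Illustrative example only (the authoritative enum ordering is in the nbnxm headers):
 * assuming ElecType::RF maps to row 1 and VdwType::CutCombGeom to column 1, the
 * force+energy, no-prune flavor would be looked up as
 *
 *   nbnxn_cu_kfunc_ptr_t k =
 *           nb_kfunc_ener_noprune_ptr[static_cast<int>(ElecType::RF)]
 *                                    [static_cast<int>(VdwType::CutCombGeom)];
 *   // k == nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_cuda
 */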

/*! Force-only kernel function pointers. */
static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_noprune_ptr[c_numElecTypes][c_numVdwTypes] = {
    { nbnxn_kernel_ElecCut_VdwLJ_F_cuda,
      nbnxn_kernel_ElecCut_VdwLJCombGeom_F_cuda,
      nbnxn_kernel_ElecCut_VdwLJCombLB_F_cuda,
      nbnxn_kernel_ElecCut_VdwLJFsw_F_cuda,
      nbnxn_kernel_ElecCut_VdwLJPsw_F_cuda,
      nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_cuda,
      nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_cuda },
    { nbnxn_kernel_ElecRF_VdwLJ_F_cuda,
      nbnxn_kernel_ElecRF_VdwLJCombGeom_F_cuda,
      nbnxn_kernel_ElecRF_VdwLJCombLB_F_cuda,
      nbnxn_kernel_ElecRF_VdwLJFsw_F_cuda,
      nbnxn_kernel_ElecRF_VdwLJPsw_F_cuda,
      nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_cuda,
      nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_cuda },
    { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_cuda },
    { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_cuda },
    { nbnxn_kernel_ElecEw_VdwLJ_F_cuda,
      nbnxn_kernel_ElecEw_VdwLJCombGeom_F_cuda,
      nbnxn_kernel_ElecEw_VdwLJCombLB_F_cuda,
      nbnxn_kernel_ElecEw_VdwLJFsw_F_cuda,
      nbnxn_kernel_ElecEw_VdwLJPsw_F_cuda,
      nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_cuda,
      nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_cuda },
    { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_cuda }
};

/*! Force + energy kernel function pointers. */
static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_noprune_ptr[c_numElecTypes][c_numVdwTypes] = {
    { nbnxn_kernel_ElecCut_VdwLJ_VF_cuda,
      nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_cuda,
      nbnxn_kernel_ElecCut_VdwLJCombLB_VF_cuda,
      nbnxn_kernel_ElecCut_VdwLJFsw_VF_cuda,
      nbnxn_kernel_ElecCut_VdwLJPsw_VF_cuda,
      nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_cuda,
      nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_cuda },
    { nbnxn_kernel_ElecRF_VdwLJ_VF_cuda,
      nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_cuda,
      nbnxn_kernel_ElecRF_VdwLJCombLB_VF_cuda,
      nbnxn_kernel_ElecRF_VdwLJFsw_VF_cuda,
      nbnxn_kernel_ElecRF_VdwLJPsw_VF_cuda,
      nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_cuda,
      nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_cuda },
    { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_cuda },
    { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_cuda },
    { nbnxn_kernel_ElecEw_VdwLJ_VF_cuda,
      nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_cuda,
      nbnxn_kernel_ElecEw_VdwLJCombLB_VF_cuda,
      nbnxn_kernel_ElecEw_VdwLJFsw_VF_cuda,
      nbnxn_kernel_ElecEw_VdwLJPsw_VF_cuda,
      nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_cuda,
      nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_cuda },
    { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_cuda }
};

/*! Force + pruning kernel function pointers. */
static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_prune_ptr[c_numElecTypes][c_numVdwTypes] = {
    { nbnxn_kernel_ElecCut_VdwLJ_F_prune_cuda,
      nbnxn_kernel_ElecCut_VdwLJCombGeom_F_prune_cuda,
      nbnxn_kernel_ElecCut_VdwLJCombLB_F_prune_cuda,
      nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_cuda,
      nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_cuda,
      nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_cuda,
      nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_cuda },
    { nbnxn_kernel_ElecRF_VdwLJ_F_prune_cuda,
      nbnxn_kernel_ElecRF_VdwLJCombGeom_F_prune_cuda,
      nbnxn_kernel_ElecRF_VdwLJCombLB_F_prune_cuda,
      nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_cuda,
      nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_cuda,
      nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_cuda,
      nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_cuda },
    { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_prune_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_prune_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_cuda },
    { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_prune_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_prune_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_cuda },
    { nbnxn_kernel_ElecEw_VdwLJ_F_prune_cuda,
      nbnxn_kernel_ElecEw_VdwLJCombGeom_F_prune_cuda,
      nbnxn_kernel_ElecEw_VdwLJCombLB_F_prune_cuda,
      nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_cuda,
      nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_cuda,
      nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_cuda,
      nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_cuda },
    { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_prune_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_prune_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_cuda }
};

/*! Force + energy + pruning kernel function pointers. */
static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_prune_ptr[c_numElecTypes][c_numVdwTypes] = {
    { nbnxn_kernel_ElecCut_VdwLJ_VF_prune_cuda,
      nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_prune_cuda,
      nbnxn_kernel_ElecCut_VdwLJCombLB_VF_prune_cuda,
      nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_cuda,
      nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_cuda,
      nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_cuda,
      nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_cuda },
    { nbnxn_kernel_ElecRF_VdwLJ_VF_prune_cuda,
      nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_prune_cuda,
      nbnxn_kernel_ElecRF_VdwLJCombLB_VF_prune_cuda,
      nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_cuda,
      nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_cuda,
      nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_cuda,
      nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_cuda },
    { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_prune_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_prune_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_cuda },
    { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_prune_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_prune_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_cuda },
    { nbnxn_kernel_ElecEw_VdwLJ_VF_prune_cuda,
      nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_prune_cuda,
      nbnxn_kernel_ElecEw_VdwLJCombLB_VF_prune_cuda,
      nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_cuda,
      nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_cuda,
      nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_cuda,
      nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_cuda },
    { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_prune_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_prune_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_cuda }
};

/*! Return a pointer to the kernel version to be executed at the current step. */
static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(enum ElecType           elecType,
                                                       enum VdwType            vdwType,
                                                       bool                    bDoEne,
                                                       bool                    bDoPrune,
                                                       const DeviceInformation gmx_unused* deviceInfo)
{
    const int elecTypeIdx = static_cast<int>(elecType);
    const int vdwTypeIdx  = static_cast<int>(vdwType);

    GMX_ASSERT(elecTypeIdx < c_numElecTypes,
               "The electrostatics type requested is not implemented in the CUDA kernels.");
    GMX_ASSERT(vdwTypeIdx < c_numVdwTypes,
               "The VdW type requested is not implemented in the CUDA kernels.");

    /* assert assumptions made by the kernels */
    GMX_ASSERT(c_nbnxnGpuClusterSize * c_nbnxnGpuClusterSize / c_nbnxnGpuClusterpairSplit
                       == deviceInfo->prop.warpSize,
               "The CUDA kernels require the "
               "cluster_size_i*cluster_size_j/nbnxn_gpu_clusterpair_split to match the warp size "
               "of the architecture targeted.");

    if (bDoEne)
    {
        if (bDoPrune)
        {
            return nb_kfunc_ener_prune_ptr[elecTypeIdx][vdwTypeIdx];
        }
        else
        {
            return nb_kfunc_ener_noprune_ptr[elecTypeIdx][vdwTypeIdx];
        }
    }
    else
    {
        if (bDoPrune)
        {
            return nb_kfunc_noener_prune_ptr[elecTypeIdx][vdwTypeIdx];
        }
        else
        {
            return nb_kfunc_noener_noprune_ptr[elecTypeIdx][vdwTypeIdx];
        }
    }
}

/*! \brief Calculates the amount of shared memory required by the nonbonded kernel in use. */
static inline int calc_shmem_required_nonbonded(const int               num_threads_z,
                                                const DeviceInformation gmx_unused* deviceInfo,
                                                const NBParamGpu*                   nbp)
{
    int shmem;

    assert(deviceInfo);

    /* size of shmem (force-buffers/xq/atom type preloading) */
    /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
    /* i-atom x+q in shared memory */
    shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float4);
    /* cj in shared memory, for each warp separately */
    shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int);

    if (nbp->vdwType == VdwType::CutCombGeom || nbp->vdwType == VdwType::CutCombLB)
    {
        /* i-atom LJ combination parameters in shared memory */
        shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float2);
    }
    else
    {
        /* i-atom types in shared memory */
        shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(int);
    }

    return shmem;
}
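/* Worked example (illustrative only; the actual values come from the constants defined
 * in the nbnxm headers): assuming c_clSize = 8, c_nbnxnGpuNumClusterPerSupercluster = 8,
 * c_nbnxnGpuClusterpairSplit = 2, c_nbnxnGpuJgroupSize = 4 and num_threads_z = 1:
 *   8*8*sizeof(float4) = 1024 B for i-atom x+q preloading,
 *   1*2*4*sizeof(int)  =   32 B for the per-warp cj buffers,
 *   8*8*sizeof(int)    =  256 B for i-atom types (or 8*8*sizeof(float2) = 512 B for
 *                               LJ combination parameters),
 * i.e. roughly 1.3 kB of shared memory per thread block.
 */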

/*! As we execute nonbonded workload in separate streams, before launching
   the kernel we need to make sure that the following operations have completed:
   - atomdata allocation and related H2D transfers (every nstlist step);
   - pair list H2D transfer (every nstlist step);
   - shift vector H2D transfer (every nstlist step);
   - force (+shift force and energy) output clearing (every step).

   These operations are issued in the local stream at the beginning of the step
   and therefore always complete before the local kernel launch. The non-local
   kernel is launched after the local one on the same device/context, hence it is
   inherently scheduled after the operations in the local stream (including the
   above "misc_ops") on pre-GK110 devices with a single hardware queue, but on later
   devices with multiple hardware queues the dependency needs to be enforced.
   We use the misc_ops_and_local_H2D_done event to record the point where
   the local x+q H2D (and all preceding) tasks are complete and synchronize
   with this event in the non-local stream before launching the non-bonded kernel.
 */
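/* A minimal sketch of how that dependency is typically expressed with
 * GpuEventSynchronizer (the actual recording/waiting happens in the setup and x/q
 * copy code elsewhere, not in this file; stream names below are illustrative):
 *
 *   // in the local stream, after the local x+q H2D and the misc. ops:
 *   nb->misc_ops_and_local_H2D_done.markEvent(localStream);
 *   // in the non-local stream, before launching the non-local kernel:
 *   nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(nonLocalStream);
 */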
void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc)
{
    NBAtomDataGpu*      adat         = nb->atdat;
    NBParamGpu*         nbp          = nb->nbparam;
    gpu_plist*          plist        = nb->plist[iloc];
    Nbnxm::GpuTimers*   timers       = nb->timers;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    bool bDoTime = nb->bDoTime;

    /* Don't launch the non-local kernel if there is no work to do.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local kernel, and later (not in
       this function) the stream wait, local f copyback and the f buffer
       clearing. All these operations, except for the local interaction kernel,
       are needed for the non-local interactions. The skip of the local kernel
       call is taken care of later in this function. */
    if (canSkipNonbondedWork(*nb, iloc))
    {
        plist->haveFreshList = false;

        return;
    }

    if (nbp->useDynamicPruning && plist->haveFreshList)
    {
        /* Prunes for rlistOuter and rlistInner, sets plist->haveFreshList=false
           (TODO: ATM that's the way the timing accounting can distinguish between
           separate prune kernel and combined force+prune, maybe we need a better way?).
         */
        gpu_launch_kernel_pruneonly(nb, iloc, 1);
    }

    if (plist->nsci == 0)
    {
        /* Don't launch an empty local kernel (not allowed with CUDA) */
        return;
    }

    /* beginning of timed nonbonded calculation section */
    if (bDoTime)
    {
        timers->interaction[iloc].nb_k.openTimingRegion(deviceStream);
    }

    /* Kernel launch config:
     * - The thread block dimensions match the size of i-clusters, j-clusters,
     *   and j-cluster concurrency, in x, y, and z, respectively.
     * - The 1D block-grid contains as many blocks as super-clusters.
     */
    int num_threads_z = 1;
    if (nb->deviceContext_->deviceInfo().prop.major == 3 && nb->deviceContext_->deviceInfo().prop.minor == 7)
    {
        num_threads_z = 2;
    }
    int nblock = calc_nb_kernel_nblock(plist->nsci, &nb->deviceContext_->deviceInfo());


    KernelLaunchConfig config;
    config.blockSize[0] = c_clSize;
    config.blockSize[1] = c_clSize;
    config.blockSize[2] = num_threads_z;
    config.gridSize[0]  = nblock;
    config.sharedMemorySize =
            calc_shmem_required_nonbonded(num_threads_z, &nb->deviceContext_->deviceInfo(), nbp);

    if (debug)
    {
        fprintf(debug,
                "Non-bonded GPU launch configuration:\n\tThread block: %zux%zux%zu\n\t"
                "\tGrid: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
                "\tShMem: %zu\n",
                config.blockSize[0],
                config.blockSize[1],
                config.blockSize[2],
                config.gridSize[0],
                config.gridSize[1],
                plist->nsci * c_nbnxnGpuNumClusterPerSupercluster,
                c_nbnxnGpuNumClusterPerSupercluster,
                plist->na_c,
                config.sharedMemorySize);
    }

    auto*      timingEvent = bDoTime ? timers->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
    const auto kernel =
            select_nbnxn_kernel(nbp->elecType,
                                nbp->vdwType,
                                stepWork.computeEnergy,
                                (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune),
                                &nb->deviceContext_->deviceInfo());
    const auto kernelArgs =
            prepareGpuKernelArguments(kernel, config, adat, nbp, plist, &stepWork.computeVirial);
    launchGpuKernel(kernel, config, deviceStream, timingEvent, "k_calc_nb", kernelArgs);

    if (bDoTime)
    {
        timers->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
    }

    if (GMX_NATIVE_WINDOWS)
    {
        /* Windows: force flushing WDDM queue */
        cudaStreamQuery(deviceStream.stream());
    }
}
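/* Rough sketch of how a caller drives this function in a step (illustrative only;
 * the authoritative declarations are in nbnxm_gpu.h and the scheduling lives in the
 * nonbonded module's higher-level code):
 *
 *   Nbnxm::gpu_copy_xq_to_gpu(nb, nbat, AtomLocality::Local);  // x+q H2D in the local stream
 *   Nbnxm::gpu_launch_kernel(nb, stepWork, InteractionLocality::Local);
 *   Nbnxm::gpu_launch_cpyback(nb, nbat, stepWork, AtomLocality::Local);
 *   // ... later the CPU polls/waits for the results via gpu_try_finish_task()
 */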

/*! Calculates the amount of shared memory required by the CUDA kernel in use. */
static inline int calc_shmem_required_prune(const int num_threads_z)
{
    int shmem;

    /* i-atom x in shared memory */
    shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float4);
    /* cj in shared memory, for each warp separately */
    shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int);

    return shmem;
}

void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts)
{
    NBAtomDataGpu*      adat         = nb->atdat;
    NBParamGpu*         nbp          = nb->nbparam;
    gpu_plist*          plist        = nb->plist[iloc];
    Nbnxm::GpuTimers*   timers       = nb->timers;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    bool bDoTime = nb->bDoTime;

    if (plist->haveFreshList)
    {
        GMX_ASSERT(numParts == 1, "With first pruning we expect 1 part");

        /* Set rollingPruningNumParts to signal that it is not set */
        plist->rollingPruningNumParts = 0;
        plist->rollingPruningPart     = 0;
    }
    else
    {
        if (plist->rollingPruningNumParts == 0)
        {
            plist->rollingPruningNumParts = numParts;
        }
        else
        {
            GMX_ASSERT(numParts == plist->rollingPruningNumParts,
                       "It is not allowed to change numParts in between list generation steps");
        }
    }

    /* Use a local variable for part and update in plist, so we can return here
     * without duplicating the part increment code.
     */
    int part = plist->rollingPruningPart;

    plist->rollingPruningPart++;
    if (plist->rollingPruningPart >= plist->rollingPruningNumParts)
    {
        plist->rollingPruningPart = 0;
    }

    /* Compute the number of list entries to prune in this pass */
    int numSciInPart = (plist->nsci - part) / numParts;
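    /* Illustrative arithmetic (example values only): with numParts = 2 and
     * plist->nsci = 100, the part = 0 pass gets numSciInPart = (100 - 0)/2 = 50 and the
     * part = 1 pass gets (100 - 1)/2 = 49 (integer division); the increment above then
     * cycles part through 0, 1, 0, 1, ... on successive rolling-prune calls.
     */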

    /* Don't launch the kernel if there is no work to do (not allowed with CUDA) */
    if (numSciInPart <= 0)
    {
        plist->haveFreshList = false;

        return;
    }

    GpuRegionTimer* timer = nullptr;
    if (bDoTime)
    {
        timer = &(plist->haveFreshList ? timers->interaction[iloc].prune_k
                                       : timers->interaction[iloc].rollingPrune_k);
    }

    /* beginning of timed prune calculation section */
    if (bDoTime)
    {
        timer->openTimingRegion(deviceStream);
    }

    /* Kernel launch config:
     * - The thread block dimensions match the size of i-clusters, j-clusters,
     *   and j-cluster concurrency, in x, y, and z, respectively.
     * - The 1D block-grid contains as many blocks as super-clusters.
     */
    int num_threads_z = c_pruneKernelJ4Concurrency;
    int nblock        = calc_nb_kernel_nblock(numSciInPart, &nb->deviceContext_->deviceInfo());
    KernelLaunchConfig config;
    config.blockSize[0]     = c_clSize;
    config.blockSize[1]     = c_clSize;
    config.blockSize[2]     = num_threads_z;
    config.gridSize[0]      = nblock;
    config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);

    if (debug)
    {
        fprintf(debug,
                "Pruning GPU kernel launch configuration:\n\tThread block: %zux%zux%zu\n\t"
                "\tGrid: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
                "\tShMem: %zu\n",
                config.blockSize[0],
                config.blockSize[1],
                config.blockSize[2],
                config.gridSize[0],
                config.gridSize[1],
                numSciInPart * c_nbnxnGpuNumClusterPerSupercluster,
                c_nbnxnGpuNumClusterPerSupercluster,
                plist->na_c,
                config.sharedMemorySize);
    }

    auto*          timingEvent  = bDoTime ? timer->fetchNextEvent() : nullptr;
    constexpr char kernelName[] = "k_pruneonly";
    const auto     kernel =
            plist->haveFreshList ? nbnxn_kernel_prune_cuda<true> : nbnxn_kernel_prune_cuda<false>;
    const auto kernelArgs = prepareGpuKernelArguments(kernel, config, adat, nbp, plist, &numParts, &part);
    launchGpuKernel(kernel, config, deviceStream, timingEvent, kernelName, kernelArgs);

    /* TODO: consider a more elegant way to track which kernel has been called
       (combined or separate 1st pass prune, rolling prune). */
    if (plist->haveFreshList)
    {
        plist->haveFreshList = false;
        /* Mark that pruning has been done */
        nb->timers->interaction[iloc].didPrune = true;
    }
    else
    {
        /* Mark that rolling pruning has been done */
        nb->timers->interaction[iloc].didRollingPrune = true;
    }

    if (bDoTime)
    {
        timer->closeTimingRegion(deviceStream);
    }

    if (GMX_NATIVE_WINDOWS)
    {
        /* Windows: force flushing WDDM queue */
        cudaStreamQuery(deviceStream.stream());
    }
}

void cuda_set_cacheconfig()
{
    cudaError_t stat;

    for (int i = 0; i < c_numElecTypes; i++)
    {
        for (int j = 0; j < c_numVdwTypes; j++)
        {
            /* Default kernel 32/32 kB Shared/L1 */
            cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferEqual);
            cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferEqual);
            cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferEqual);
            stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferEqual);
            CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
        }
    }
}

} // namespace Nbnxm