/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief Define OpenCL implementation of nbnxn_gpu.h
 *
 *  \author Anca Hamuraru <anca@streamcomputing.eu>
 *  \author Teemu Virolainen <teemu@streamcomputing.eu>
 *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \ingroup module_mdlib
 *
 *  TODO:
 *  - Add a static const cl_uint c_pruneKernelWorkDim / c_nbnxnKernelWorkDim = 3;
 *  - Rework the copying of OCL data structures done before every invocation of both
 *    nb and prune kernels (using fillin_ocl_structures); also consider at the same
 *    time calling clSetKernelArg only on the updated parameters (if tracking changed
 *    parameters is feasible);
 *  - Consider using the event_wait_list argument to clEnqueueNDRangeKernel to mark
 *    dependencies on the kernel launched: e.g. the non-local nb kernel's dependency
 *    on the misc_ops_and_local_H2D_done event could be better expressed this way
 *    (see the sketch after this comment block);
 *  - Consider extracting common sections of the OpenCL and CUDA nbnxn logic, e.g.:
 *    - in nbnxn_gpu_launch_kernel_pruneonly() the pre- and post-kernel launch logic
 *      is identical in the two implementations, so a 3-way split might allow sharing
 *      the common code between them.
 */
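
/* Illustrative sketch for the event_wait_list TODO above. This is only an
 * assumption of how the dependency could be expressed, not code that is wired
 * in; the names follow this file's conventions:
 *
 *     // Let the runtime order the non-local kernel after the local-stream
 *     // marker instead of enqueueing an explicit barrier/wait:
 *     cl_event deps[1] = { nb->misc_ops_and_local_H2D_done };
 *     cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL,
 *                                       global_work_size, local_work_size,
 *                                       1, deps, NULL);
 */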
#include "thread_mpi/atomic.h"

#include "gromacs/gpu_utils/oclutils.h"
#include "gromacs/hardware/hw_info.h"
#include "gromacs/mdlib/force_flags.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_consts.h"
#include "gromacs/mdlib/nbnxn_gpu.h"
#include "gromacs/mdlib/nbnxn_gpu_common.h"
#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/mdlib/nbnxn_pairlist.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"

#include "nbnxn_ocl_internal.h"
#include "nbnxn_ocl_types.h"
/*! \brief Convenience constants */
static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
static const int c_clSize          = c_nbnxnGpuClusterSize;
/*! \brief Always/never run the energy/pruning kernels -- only for benchmarking purposes */
static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
/* Uncomment this define to enable kernel debugging */
//#define DEBUG_OCL

/*! \brief Specifies which kernel run to debug */
#define DEBUG_RUN_STEP 2
/*! \brief Validates the input global work size parameter.
 */
static inline void validate_global_work_size(size_t *global_work_size, int work_dim, const gmx_device_info_t *dinfo)
{
    cl_uint device_size_t_size_bits;
    cl_uint host_size_t_size_bits;

    /* Each component of a global_work_size must not exceed the range given by the
       sizeof(device size_t) for the device on which the kernel execution will
       be enqueued. See:
       https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
     */
    device_size_t_size_bits = dinfo->adress_bits;
    host_size_t_size_bits   = (cl_uint)(sizeof(size_t) * 8);

    /* If sizeof(host size_t) <= sizeof(device size_t)
       => global_work_size components will always be valid;
       else
       => get the device limit for the global work size and
       compare it against each component of global_work_size.
     */
    if (host_size_t_size_bits > device_size_t_size_bits)
    {
        size_t device_limit;

        device_limit = (((size_t)1) << device_size_t_size_bits) - 1;

        for (int i = 0; i < work_dim; i++)
        {
            if (global_work_size[i] > device_limit)
            {
                gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
                          "The number of nonbonded work units (=number of super-clusters) exceeds the "
                          "device capabilities. Global work size limit exceeded (%zu > %zu)!",
                          global_work_size[i], device_limit);
            }
        }
    }
}
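
/* For example, a device reporting 32 address bits gives
 * device_limit = 2^32 - 1 = 4294967295; on a 64-bit host each
 * global_work_size component is then checked against this bound. */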
/* Constant arrays listing non-bonded kernel function names. The arrays are
 * organized in 2-dim arrays by: electrostatics and VDW type.
 *
 * Note that the row- and column-order of function pointers has to match the
 * order of the corresponding enumerated electrostatics and vdw types, resp.,
 * defined in nbnxn_ocl_types.h.
 */
/*! \brief Force-only kernel function names. */
static const char* nb_kfunc_noener_noprune_ptr[eelOclNR][evdwOclNR] =
{
    { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_opencl" },
    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_opencl" }
};
/*! \brief Force + energy kernel function pointers. */
static const char* nb_kfunc_ener_noprune_ptr[eelOclNR][evdwOclNR] =
{
    { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_opencl" },
    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_opencl" }
};
/*! \brief Force + pruning kernel function pointers. */
static const char* nb_kfunc_noener_prune_ptr[eelOclNR][evdwOclNR] =
{
    { "nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_opencl" },
    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_opencl" }
};
/*! \brief Force + energy + pruning kernel function pointers. */
static const char* nb_kfunc_ener_prune_ptr[eelOclNR][evdwOclNR] =
{
    { "nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_opencl" },
    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_opencl" }
};
/*! \brief Return a pointer to the prune kernel version to be executed at the current invocation.
 *
 * \param[in] kernel_pruneonly  array of prune kernel objects
 * \param[in] firstPrunePass    true if the first pruning pass is being executed
 */
static inline cl_kernel selectPruneKernel(cl_kernel kernel_pruneonly[],
                                          bool      firstPrunePass)
{
    cl_kernel *kernelPtr;

    if (firstPrunePass)
    {
        kernelPtr = &(kernel_pruneonly[epruneFirst]);
    }
    else
    {
        kernelPtr = &(kernel_pruneonly[epruneRolling]);
    }
    // TODO: consider creating the prune kernel object here to avoid a
    // clCreateKernel for the rolling prune kernel if this is not needed.
    return *kernelPtr;
}
/*! \brief Return a pointer to the kernel version to be executed at the current step.
 *  OpenCL kernel objects are cached in nb. If the requested kernel is not
 *  found in the cache, it will be created and the cache will be updated.
 */
static inline cl_kernel select_nbnxn_kernel(gmx_nbnxn_ocl_t *nb,
                                            int              eeltype,
                                            int              evdwtype,
                                            bool             bDoEne,
                                            bool             bDoPrune)
{
    const char *kernel_name_to_run;
    cl_kernel  *kernel_ptr;
    cl_int      cl_error;

    assert(eeltype  < eelOclNR);
    assert(evdwtype < evdwOclNR);

    if (bDoEne)
    {
        if (bDoPrune)
        {
            kernel_name_to_run = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
            kernel_ptr         = &(nb->kernel_ener_prune_ptr[eeltype][evdwtype]);
        }
        else
        {
            kernel_name_to_run = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
            kernel_ptr         = &(nb->kernel_ener_noprune_ptr[eeltype][evdwtype]);
        }
    }
    else
    {
        if (bDoPrune)
        {
            kernel_name_to_run = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
            kernel_ptr         = &(nb->kernel_noener_prune_ptr[eeltype][evdwtype]);
        }
        else
        {
            kernel_name_to_run = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
            kernel_ptr         = &(nb->kernel_noener_noprune_ptr[eeltype][evdwtype]);
        }
    }

    if (NULL == kernel_ptr[0])
    {
        *kernel_ptr = clCreateKernel(nb->dev_rundata->program, kernel_name_to_run, &cl_error);
        assert(cl_error == CL_SUCCESS);
    }
    // TODO: handle errors

    return *kernel_ptr;
}
/*! \brief Calculates the amount of shared memory required by the nonbonded kernel in use.
 */
static inline int calc_shmem_required_nonbonded(int  vdwType,
                                                bool bPrefetchLjParam)
{
    int shmem;

    /* size of shmem (force-buffers/xq/atom type preloading) */
    /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
    /* i-atom x+q in shared memory */
    shmem  = c_numClPerSupercl * c_clSize * sizeof(float) * 4; /* xqib */
    /* cj in shared memory, for both warps separately */
    shmem += 2 * c_nbnxnGpuJgroupSize * sizeof(int);           /* cjs  */
    if (bPrefetchLjParam)
    {
        if (useLjCombRule(vdwType))
        {
            /* i-atom LJ combination parameters in shared memory */
            shmem += c_numClPerSupercl * c_clSize * 2*sizeof(float); /* atib abused for ljcp, float2 */
        }
        else
        {
            /* i-atom types in shared memory */
            shmem += c_numClPerSupercl * c_clSize * sizeof(int); /* atib */
        }
    }
    /* force reduction buffers in shared memory */
    shmem += c_clSize * c_clSize * 3 * sizeof(float); /* f_buf */
    /* Warp vote; strictly this should be scaled by the number of warps in the block. */
    shmem += sizeof(cl_uint) * 2;                     /* warp_any */

    return shmem;
}
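
/* Worked example, assuming the default c_clSize == 8, c_numClPerSupercl == 8
 * and c_nbnxnGpuJgroupSize == 4 (illustration only; the actual values are set
 * elsewhere): xqib = 8*8*4*4 = 4096 B, cjs = 2*4*4 = 32 B, atib = 8*8*4 = 256 B
 * (atom-type prefetch), f_buf = 8*8*3*4 = 768 B and warp_any = 8 B, i.e.
 * roughly 5 KiB of local memory per work-group. */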
/*! \brief Initializes data structures that are going to be sent to the OpenCL device.
 *
 *  The device can't use the same data structures as the host for two main reasons:
 *  - OpenCL restrictions (pointers are not accepted inside data structures)
 *  - some host-side fields are not needed for the OpenCL kernels.
 *
 *  This function is called before the launch of both nbnxn and prune kernels.
 */
static void fillin_ocl_structures(cl_nbparam_t        *nbp,
                                  cl_nbparam_params_t *nbparams_params)
{
    nbparams_params->coulomb_tab_scale = nbp->coulomb_tab_scale;
    nbparams_params->c_rf              = nbp->c_rf;
    nbparams_params->dispersion_shift  = nbp->dispersion_shift;
    nbparams_params->eeltype           = nbp->eeltype;
    nbparams_params->epsfac            = nbp->epsfac;
    nbparams_params->ewaldcoeff_lj     = nbp->ewaldcoeff_lj;
    nbparams_params->ewald_beta        = nbp->ewald_beta;
    nbparams_params->rcoulomb_sq       = nbp->rcoulomb_sq;
    nbparams_params->repulsion_shift   = nbp->repulsion_shift;
    nbparams_params->rlistOuter_sq     = nbp->rlistOuter_sq;
    nbparams_params->rvdw_sq           = nbp->rvdw_sq;
    nbparams_params->rlistInner_sq     = nbp->rlistInner_sq;
    nbparams_params->rvdw_switch       = nbp->rvdw_switch;
    nbparams_params->sh_ewald          = nbp->sh_ewald;
    nbparams_params->sh_lj_ewald       = nbp->sh_lj_ewald;
    nbparams_params->two_k_rf          = nbp->two_k_rf;
    nbparams_params->vdwtype           = nbp->vdwtype;
    nbparams_params->vdw_switch        = nbp->vdw_switch;
}
/*! \brief Enqueues a wait for event completion.
 *
 * Then it releases the event and sets it to 0.
 * Don't use this function when more than one wait will be issued for the event.
 * Equivalent to a CUDA stream synchronization. */
static void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event)
{
    cl_int gmx_unused cl_error;

    /* Enqueue wait */
#ifdef CL_VERSION_1_2
    cl_error = clEnqueueBarrierWithWaitList(stream, 1, ocl_event, NULL);
#else
    cl_error = clEnqueueWaitForEvents(stream, 1, ocl_event);
#endif

    GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str());

    /* Release event and reset it to 0. It is OK to release it, as clEnqueueWaitForEvents performs an implicit retain on events. */
    cl_error = clReleaseEvent(*ocl_event);
    assert(CL_SUCCESS == cl_error);
    *ocl_event = 0;
}
/*! \brief Returns the duration in milliseconds for the command associated with the event.
 *
 * It then releases the event and sets it to 0.
 * Before calling this function, make sure the command has finished either by
 * calling clFinish or clWaitForEvents.
 * The function returns 0.0 if the input event, *ocl_event, is 0.
 * Don't use this function when more than one wait will be issued for the event.
 */
static double ocl_event_elapsed_ms(cl_event *ocl_event)
{
    cl_int gmx_unused cl_error;
    cl_ulong          start_ns, end_ns;
    double            elapsed_ms = 0.0;

    assert(NULL != ocl_event);

    if (*ocl_event)
    {
        cl_error = clGetEventProfilingInfo(*ocl_event, CL_PROFILING_COMMAND_START,
                                           sizeof(cl_ulong), &start_ns, NULL);
        assert(CL_SUCCESS == cl_error);

        cl_error = clGetEventProfilingInfo(*ocl_event, CL_PROFILING_COMMAND_END,
                                           sizeof(cl_ulong), &end_ns, NULL);
        assert(CL_SUCCESS == cl_error);

        clReleaseEvent(*ocl_event);
        *ocl_event = 0;

        elapsed_ms = (end_ns - start_ns) / 1000000.0;
    }

    return elapsed_ms;
}
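
/* NOTE: the event profiling queries above are only valid when the command
 * queue was created with profiling enabled. A minimal sketch (OpenCL 1.x API;
 * error handling omitted, context/device_id assumed to exist):
 *
 *     cl_int           err;
 *     cl_command_queue q = clCreateCommandQueue(context, device_id,
 *                                               CL_QUEUE_PROFILING_ENABLE, &err);
 *
 * In this module that choice is expected to have been made at queue creation
 * whenever timing (bDoTime) is active.
 */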
/*! \brief Launch GPU kernel

   As we execute nonbonded workload in separate queues, before launching
   the kernel we need to make sure that the following operations have completed:
   - atomdata allocation and related H2D transfers (every nstlist step);
   - pair list H2D transfer (every nstlist step);
   - shift vector H2D transfer (every nstlist step);
   - force (+shift force and energy) output clearing (every step).

   These operations are issued in the local queue at the beginning of the step
   and therefore always complete before the local kernel launch. The non-local
   kernel is launched after the local one on the same device/context, so it is
   inherently scheduled after the operations in the local stream (including the
   above "misc_ops").
   However, for the sake of having a future-proof implementation, we use the
   misc_ops_done event to record the point in time when the above operations
   are finished and synchronize with this event in the non-local stream.
 */
void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t               *nb,
                             const struct nbnxn_atomdata_t *nbatom,
                             int                            flags,
                             int                            iloc)
{
    cl_int gmx_unused    cl_error;
    int                  adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
    /* OpenCL kernel launch-related stuff */
    int                  shmem;
    int                  arg_no;
    size_t               local_work_size[3], global_work_size[3];
    cl_kernel            nb_kernel = NULL; /* fn pointer to the nonbonded kernel */

    cl_atomdata_t       *adat    = nb->atdat;
    cl_nbparam_t        *nbp     = nb->nbparam;
    cl_plist_t          *plist   = nb->plist[iloc];
    cl_timers_t         *t       = nb->timers;
    cl_command_queue     stream  = nb->stream[iloc];

    bool                 bCalcEner   = flags & GMX_FORCE_ENERGY;
    int                  bCalcFshift = flags & GMX_FORCE_VIRIAL;
    bool                 bDoTime     = nb->bDoTime;

    cl_nbparam_params_t  nbparams_params;
#ifdef DEBUG_OCL
    float               *debug_buffer_h;
    size_t               debug_buffer_size;
#endif
    /* turn energy calculation always on/off (for debugging/testing only) */
    bCalcEner = (bCalcEner || always_ener) && !never_ener;

    /* Don't launch the non-local kernel if there is no work to do.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local kernel, the local x+q copy and later (not in
       this function) the stream wait, local f copyback and the f buffer
       clearing. All these operations, except for the local interaction kernel,
       are needed for the non-local interactions. The skip of the local kernel
       call is taken care of later in this function. */
    if (canSkipWork(nb, iloc))
    {
        plist->haveFreshList = false;

        return;
    }
    /* calculate the atom data index range based on locality */
    if (LOCAL_I(iloc))
    {
        adat_begin = 0;
        adat_len   = adat->natoms_local;
    }
    else
    {
        adat_begin = adat->natoms_local;
        adat_len   = adat->natoms - adat->natoms_local;
    }

    /* beginning of timed HtoD section */

    /* HtoD x, q */
    ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4,
                       adat_len * sizeof(float) * 4, stream, bDoTime ? &(t->nb_h2d[iloc]) : NULL);
    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done,
       so we record that in the local stream and wait for it in the nonlocal one. */
    if (nb->bUseTwoStreams)
    {
        if (iloc == eintLocal)
        {
#ifdef CL_VERSION_1_2
            cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_and_local_H2D_done));
#else
            cl_error = clEnqueueMarker(stream, &(nb->misc_ops_and_local_H2D_done));
#endif
            assert(CL_SUCCESS == cl_error);

            /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
             * in the local stream in order to be able to sync with the above event
             * from the non-local stream.
             */
            cl_error = clFlush(stream);
            assert(CL_SUCCESS == cl_error);
        }
        else
        {
            sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done));
        }
    }
    if (nbp->useDynamicPruning && plist->haveFreshList)
    {
        /* Prunes for rlistOuter and rlistInner, sets plist->haveFreshList=false
           (that's the way the timing accounting can distinguish between
           separate prune kernel and combined force+prune).
         */
        nbnxn_gpu_launch_kernel_pruneonly(nb, iloc, 1);
    }

    if (plist->nsci == 0)
    {
        /* Don't launch an empty local kernel (it is not allowed with OpenCL).
         * TODO: Separate H2D and kernel launch into separate functions.
         */
        return;
    }
    /* beginning of timed nonbonded calculation section */

    /* get the pointer to the kernel flavor we need to use */
    nb_kernel = select_nbnxn_kernel(nb,
                                    nbp->eeltype,
                                    nbp->vdwtype,
                                    bCalcEner,
                                    (plist->haveFreshList && !nb->timers->didPrune[iloc]) || always_prune);
    /* kernel launch config */
    local_work_size[0] = c_clSize;
    local_work_size[1] = c_clSize;
    local_work_size[2] = 1;

    global_work_size[0] = plist->nsci * local_work_size[0];
    global_work_size[1] = 1 * local_work_size[1];
    global_work_size[2] = 1 * local_work_size[2];
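
    /* Illustration (assuming the default c_clSize == 8): each work-group is
     * 8x8x1 = 64 work items covering one i-super-cluster, and the grid is 1-D
     * over pair-list entries, so e.g. plist->nsci == 1000 yields a global
     * size of 8000x8x1. */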
    validate_global_work_size(global_work_size, 3, nb->dev_info);

    shmem = calc_shmem_required_nonbonded(nbp->vdwtype, nb->bPrefetchLjParam);
#ifdef DEBUG_OCL
    {
        static int run_step = 1;

        if (DEBUG_RUN_STEP == run_step)
        {
            debug_buffer_size = global_work_size[0] * global_work_size[1] * global_work_size[2] * sizeof(float);
            debug_buffer_h    = (float*)calloc(1, debug_buffer_size);
            assert(NULL != debug_buffer_h);

            if (NULL == nb->debug_buffer)
            {
                nb->debug_buffer = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                                  debug_buffer_size, debug_buffer_h, &cl_error);
                assert(CL_SUCCESS == cl_error);
            }
        }
    }
#endif
    if (debug)
    {
        fprintf(debug, "Non-bonded GPU launch configuration:\n\tLocal work size: %dx%dx%d\n\t"
                "Global work size : %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
                (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]),
                (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*c_numClPerSupercl,
                c_numClPerSupercl, plist->na_c);
    }

    fillin_ocl_structures(nbp, &nbparams_params);
    arg_no   = 0;
    cl_error = CL_SUCCESS;
    if (!useLjCombRule(nb->nbparam->vdwtype))
    {
        cl_error  = clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &(adat->ntypes));
    }
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(nbparams_params), &(nbparams_params));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->xq));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->f));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_lj));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_el));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->fshift));
    if (useLjCombRule(nb->nbparam->vdwtype))
    {
        cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->lj_comb));
    }
    else
    {
        cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->atom_types));
    }
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_climg2d));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_comb_climg2d));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->coulomb_tab_climg2d));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->sci));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->cj4));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->excl));
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &bCalcFshift);
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, shmem, NULL);
    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nb->debug_buffer));
    assert(cl_error == CL_SUCCESS);
    if (cl_error)
    {
        printf("OpenCL error: %s\n", ocl_get_error_string(cl_error).c_str());
    }

    cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? &(t->nb_k[iloc]) : NULL);
    assert(cl_error == CL_SUCCESS);
#ifdef DEBUG_OCL
    {
        static int run_step = 1;

        if (DEBUG_RUN_STEP == run_step)
        {
            FILE *pf;
            char  file_name[256] = {0};

            ocl_copy_D2H_async(debug_buffer_h, nb->debug_buffer, 0,
                               debug_buffer_size, stream, NULL);

            // Make sure all data has been transferred back from the device
            clFinish(stream);

            printf("\nWriting debug_buffer to debug_buffer_ocl.txt...");

            sprintf(file_name, "debug_buffer_ocl_%d.txt", DEBUG_RUN_STEP);
            pf = fopen(file_name, "wt");
            assert(pf != NULL);

            fprintf(pf, "%20s", "");
            for (int j = 0; j < (int)global_work_size[0]; j++)
            {
                char label[20];
                sprintf(label, "(wIdx=%2d thIdx=%2d)", (int)(j / local_work_size[0]), (int)(j % local_work_size[0]));
                fprintf(pf, "%20s", label);
            }

            for (int i = 0; i < (int)global_work_size[1]; i++)
            {
                char label[20];
                sprintf(label, "(wIdy=%2d thIdy=%2d)", (int)(i / local_work_size[1]), (int)(i % local_work_size[1]));
                fprintf(pf, "\n%20s", label);

                for (int j = 0; j < (int)global_work_size[0]; j++)
                {
                    fprintf(pf, "%20.5f", debug_buffer_h[i * global_work_size[0] + j]);
                }
            }

            fclose(pf);

            free(debug_buffer_h);
            debug_buffer_h = NULL;
        }

        run_step++;
    }
#endif
}
/*! \brief Calculates the amount of shared memory required by the prune kernel.
 *
 *  Note that for the sake of simplicity we use the CUDA terminology "shared memory"
 *  for OpenCL local memory.
 *
 * \param[in] num_threads_z  cj4 concurrency equal to the number of threads/work items in the 3rd dimension.
 * \returns   the amount of local memory in bytes required by the pruning kernel
 */
static inline int calc_shmem_required_prune(const int num_threads_z)
{
    int shmem;

    /* i-atom x in shared memory (for convenience we load all 4 components including q) */
    shmem  = c_numClPerSupercl * c_clSize * sizeof(float)*4;
    /* cj in shared memory, for each warp separately */
    shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int);
    /* Warp vote, requires one uint per warp/32 threads per block. */
    shmem += sizeof(cl_uint) * 2*num_threads_z;

    return shmem;
}
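
/* Worked example, assuming the defaults c_clSize == 8, c_numClPerSupercl == 8,
 * c_nbnxnGpuClusterpairSplit == 2 and c_nbnxnGpuJgroupSize == 4 (illustration
 * only), with num_threads_z == 4: xq = 8*8*4*4 = 4096 B, cj = 4*2*4*4 = 128 B
 * and warp_any = 4*2*4 = 32 B, i.e. ~4.2 KB of local memory per work-group. */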
void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb,
                                       int              iloc,
                                       int              numParts)
{
    cl_int gmx_unused cl_error;

    cl_atomdata_t    *adat    = nb->atdat;
    cl_nbparam_t     *nbp     = nb->nbparam;
    cl_plist_t       *plist   = nb->plist[iloc];
    cl_timers_t      *t       = nb->timers;
    cl_command_queue  stream  = nb->stream[iloc];
    bool              bDoTime = nb->bDoTime;

    if (plist->haveFreshList)
    {
        GMX_ASSERT(numParts == 1, "With first pruning we expect 1 part");

        /* Set rollingPruningNumParts to signal that it is not set */
        plist->rollingPruningNumParts = 0;
        plist->rollingPruningPart     = 0;
    }
    else
    {
        if (plist->rollingPruningNumParts == 0)
        {
            plist->rollingPruningNumParts = numParts;
        }
        else
        {
            GMX_ASSERT(numParts == plist->rollingPruningNumParts, "It is not allowed to change numParts in between list generation steps");
        }
    }

    /* Use a local variable for part and update in plist, so we can return here
     * without duplicating the part increment code.
     */
    int part = plist->rollingPruningPart;

    plist->rollingPruningPart++;
    if (plist->rollingPruningPart >= plist->rollingPruningNumParts)
    {
        plist->rollingPruningPart = 0;
    }

    /* Compute the number of list entries to prune in this pass */
    int numSciInPart = (plist->nsci - part)/numParts;

    /* Don't launch the kernel if there is no work to do. */
    if (numSciInPart <= 0)
    {
        plist->haveFreshList = false;

        return;
    }
    /* Kernel launch config:
     * - The thread block dimensions match the size of i-clusters, j-clusters,
     *   and j-cluster concurrency, in x, y, and z, respectively.
     * - The 1D block-grid contains as many blocks as super-clusters.
     */
    int       num_threads_z = getOclPruneKernelJ4Concurrency(nb->dev_info->vendor_e);
    cl_kernel pruneKernel   = selectPruneKernel(nb->kernel_pruneonly, plist->haveFreshList);

    /* kernel launch config */
    size_t local_work_size[3], global_work_size[3];
    local_work_size[0] = c_clSize;
    local_work_size[1] = c_clSize;
    local_work_size[2] = num_threads_z;

    global_work_size[0] = numSciInPart * local_work_size[0];
    global_work_size[1] = 1 * local_work_size[1];
    global_work_size[2] = 1 * local_work_size[2];

    validate_global_work_size(global_work_size, 3, nb->dev_info);

    int shmem = calc_shmem_required_prune(num_threads_z);
    if (debug)
    {
        fprintf(debug, "Pruning GPU kernel launch configuration:\n\tLocal work size: %dx%dx%d\n\t"
                "\tGlobal work size: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n"
                "\tShMem: %d\n",
                (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]),
                (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*c_numClPerSupercl,
                c_numClPerSupercl, plist->na_c, shmem);
    }

    cl_nbparam_params_t nbparams_params;
    fillin_ocl_structures(nbp, &nbparams_params);
    cl_uint arg_no = 0;
    cl_error = CL_SUCCESS;

    cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(nbparams_params), &(nbparams_params));
    cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(adat->xq));
    cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec));
    cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(plist->sci));
    cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(plist->cj4));
    cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(plist->imask));
    cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(int), &(numParts));
    cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(int), &(part));
    cl_error |= clSetKernelArg(pruneKernel, arg_no++, shmem, nullptr);
    assert(cl_error == CL_SUCCESS);
    cl_event *pruneEventPtr = nullptr;
    if (bDoTime)
    {
        pruneEventPtr = plist->haveFreshList ? &t->prune_k[iloc] : &t->rollingPrune_k[iloc];
    }

    cl_error = clEnqueueNDRangeKernel(stream, pruneKernel, 3,
                                      nullptr, global_work_size, local_work_size,
                                      0, nullptr, pruneEventPtr);
    GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str());
    if (plist->haveFreshList)
    {
        plist->haveFreshList = false;
        /* Mark that pruning has been done */
        nb->timers->didPrune[iloc] = true;
    }
    else
    {
        /* Mark that rolling pruning has been done */
        nb->timers->didRollingPrune[iloc] = true;
    }
}
/*! \brief
 * Launch asynchronously the download of nonbonded forces from the GPU
 * (and energies/shift forces if required).
 */
void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t               *nb,
                              const struct nbnxn_atomdata_t *nbatom,
                              int                            flags,
                              int                            aloc)
{
    cl_int gmx_unused cl_error;
    int               adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
    int               iloc = -1;

    /* determine interaction locality from atom locality */
    if (LOCAL_A(aloc))
    {
        iloc = eintLocal;
    }
    else if (NONLOCAL_A(aloc))
    {
        iloc = eintNonlocal;
    }
    else
    {
        char stmp[STRLEN];
        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
                "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);

        gmx_incons(stmp);
    }
    cl_atomdata_t   *adat    = nb->atdat;
    cl_timers_t     *t       = nb->timers;
    bool             bDoTime = nb->bDoTime;
    cl_command_queue stream  = nb->stream[iloc];

    bool bCalcEner   = flags & GMX_FORCE_ENERGY;
    int  bCalcFshift = flags & GMX_FORCE_VIRIAL;
    /* don't launch non-local copy-back if there was no non-local work to do */
    if (canSkipWork(nb, iloc))
    {
        /* TODO An alternative way to signal that non-local work is
           complete is to use a clEnqueueMarker+clEnqueueBarrier
           pair. However, the use of bNonLocalStreamActive has the
           advantage of being local to the host, so probably minimizes
           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
           test case, overall simulation performance was higher with
           the API calls, but this has not been tested on AMD OpenCL,
           so could be worth considering in future. */
        nb->bNonLocalStreamActive = false;
        return;
    }
    /* calculate the atom data index range based on locality */
    if (LOCAL_A(aloc))
    {
        adat_begin = 0;
        adat_len   = adat->natoms_local;
    }
    else
    {
        adat_begin = adat->natoms_local;
        adat_len   = adat->natoms - adat->natoms_local;
    }
    /* beginning of timed D2H section */

    /* With DD the local D2H transfer can only start after the non-local
       has been launched. */
    if (iloc == eintLocal && nb->bNonLocalStreamActive)
    {
        sync_ocl_event(stream, &(nb->nonlocal_done));
    }

    /* DtoH f */
    ocl_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f, adat_begin*3*sizeof(float),
                       (adat_len)* adat->f_elem_size, stream, bDoTime ? &(t->nb_d2h_f[iloc]) : NULL);

    cl_error = clFlush(stream);
    assert(CL_SUCCESS == cl_error);
    /* After the non-local D2H is launched the nonlocal_done event can be
       recorded which signals that the local D2H can proceed. This event is not
       placed after the non-local kernel because we first need the non-local
       data back. */
    if (iloc == eintNonlocal)
    {
#ifdef CL_VERSION_1_2
        cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->nonlocal_done));
#else
        cl_error = clEnqueueMarker(stream, &(nb->nonlocal_done));
#endif
        assert(CL_SUCCESS == cl_error);
        nb->bNonLocalStreamActive = true;
    }
    /* only transfer energies in the local stream */
    if (LOCAL_I(iloc))
    {
        /* DtoH fshift */
        if (bCalcFshift)
        {
            ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
                               SHIFTS * adat->fshift_elem_size, stream, bDoTime ? &(t->nb_d2h_fshift[iloc]) : NULL);
        }

        /* DtoH energies */
        if (bCalcEner)
        {
            ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0,
                               sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_lj[iloc]) : NULL);

            ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0,
                               sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_el[iloc]) : NULL);
        }
    }
}
/*! \brief Count pruning kernel time if either kernel has been triggered
 *
 *  We do the accounting for either of the two pruning kernel flavors:
 *  - 1st pass prune: ran during the current step (prior to the force kernel);
 *  - rolling prune:  ran at the end of the previous step (prior to the current step H2D xq);
 *
 *  Note that the resetting of cl_timers_t::didPrune and cl_timers_t::didRollingPrune should happen
 *  after calling this function.
 *
 * \param[inout] timers   structs with OCL timer objects
 * \param[inout] timings  GPU task timing data
 * \param[in] iloc        interaction locality
 */
static void countPruneKernelTime(cl_timers_t         *timers,
                                 gmx_wallclock_gpu_t *timings,
                                 const int            iloc)
{
    // We might have not done any pruning (e.g. if we skipped with empty domains).
    if (!timers->didPrune[iloc] && !timers->didRollingPrune[iloc])
    {
        return;
    }

    if (timers->didPrune[iloc])
    {
        timings->pruneTime.c++;
        timings->pruneTime.t += ocl_event_elapsed_ms(&timers->prune_k[iloc]);
    }

    if (timers->didRollingPrune[iloc])
    {
        timings->dynamicPruneTime.c++;
        timings->dynamicPruneTime.t += ocl_event_elapsed_ms(&timers->rollingPrune_k[iloc]);
    }
}
/*! \brief
 * Wait for the asynchronously launched nonbonded calculations and data
 * transfers to finish.
 */
void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_ocl_t *nb,
                            int flags, int aloc,
                            real *e_lj, real *e_el, rvec *fshift)
{
    /* NOTE: only implemented for single-precision at this time */
    cl_int gmx_unused cl_error;
    int               iloc = -1;

    /* determine interaction locality from atom locality */
    if (LOCAL_A(aloc))
    {
        iloc = eintLocal;
    }
    else if (NONLOCAL_A(aloc))
    {
        iloc = eintNonlocal;
    }
    else
    {
        char stmp[STRLEN];
        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
                "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);

        gmx_incons(stmp);
    }

    cl_plist_t                 *plist   = nb->plist[iloc];
    cl_timers_t                *timers  = nb->timers;
    struct gmx_wallclock_gpu_t *timings = nb->timings;
    cl_nb_staging               nbst    = nb->nbst;

    bool bCalcEner   = flags & GMX_FORCE_ENERGY;
    int  bCalcFshift = flags & GMX_FORCE_VIRIAL;

    /* turn energy calculation always on/off (for debugging/testing only) */
    bCalcEner = (bCalcEner || always_ener) && !never_ener;
    /* Launch wait/update timers & counters and do reduction into staging buffers
       BUT skip it if during the non-local phase there was actually no work to do.
       This is consistent with nbnxn_gpu_launch_kernel.

       NOTE: if timing with multiple GPUs (streams) becomes possible, the
       counters could end up being inconsistent due to not being incremented
       on some of the nodes! */
    if (!canSkipWork(nb, iloc))
    {
        /* Actual sync point. Waits for everything to be finished in the command queue. TODO: Find out if a more fine-grained solution is needed. */
        cl_error = clFinish(nb->stream[iloc]);
        assert(CL_SUCCESS == cl_error);
        /* timing data accumulation */
        if (nb->bDoTime)
        {
            /* only increase counter once (at local F wait) */
            if (LOCAL_I(iloc))
            {
                timings->nb_c++;
                timings->ktime[plist->haveFreshList ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
            }

            /* kernel timings */
            timings->ktime[plist->haveFreshList ? 1 : 0][bCalcEner ? 1 : 0].t +=
                ocl_event_elapsed_ms(timers->nb_k + iloc);

            /* X/q H2D and F D2H timings */
            timings->nb_h2d_t += ocl_event_elapsed_ms(timers->nb_h2d        + iloc);
            timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_f      + iloc);
            timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_fshift + iloc);
            timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_el   + iloc);
            timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_lj   + iloc);
            /* Count the pruning kernel times for both cases: 1st pass (at search step)
               and rolling pruning (if called at the previous step).
               We do the accounting here as this is the only sync point where we
               know (without checking or additional sync-ing) that prune tasks in
               the current stream have completed (having just blocking-waited
               for the force D2H). */
            countPruneKernelTime(timers, timings, iloc);
            /* only count atdat and pair-list H2D at pair-search step */
            if (timers->didPairlistH2D[iloc])
            {
                /* atdat transfer timing (add only once, at local F wait) */
                if (LOCAL_A(aloc))
                {
                    timings->pl_h2d_c++;
                    timings->pl_h2d_t += ocl_event_elapsed_ms(&(timers->atdat));
                }

                timings->pl_h2d_t +=
                    ocl_event_elapsed_ms(timers->pl_h2d_sci  + iloc) +
                    ocl_event_elapsed_ms(timers->pl_h2d_cj4  + iloc) +
                    ocl_event_elapsed_ms(timers->pl_h2d_excl + iloc);

                /* Clear the timing flag for the next step */
                timers->didPairlistH2D[iloc] = false;
            }
        }
        /* add up energies and shift forces (only once at local F wait) */
        if (LOCAL_I(iloc))
        {
            if (bCalcEner)
            {
                *e_lj += *nbst.e_lj;
                *e_el += *nbst.e_el;
            }

            if (bCalcFshift)
            {
                for (int i = 0; i < SHIFTS; i++)
                {
                    fshift[i][0] += (nbst.fshift)[i][0];
                    fshift[i][1] += (nbst.fshift)[i][1];
                    fshift[i][2] += (nbst.fshift)[i][2];
                }
            }
        }
    }

    /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */
    timers->didPrune[iloc] = timers->didRollingPrune[iloc] = false;

    /* Turn off initial list pruning (doesn't hurt if this is not a pair-search step). */
    plist->haveFreshList = false;
}
/*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
int nbnxn_gpu_pick_ewald_kernel_type(bool bTwinCut)
{
    bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
    int  kernel_type;

    /* Benchmarking/development environment variables to force the use of
       analytical or tabulated Ewald kernels. */
    bForceAnalyticalEwald = (getenv("GMX_OCL_NB_ANA_EWALD") != NULL);
    bForceTabulatedEwald  = (getenv("GMX_OCL_NB_TAB_EWALD") != NULL);

    if (bForceAnalyticalEwald && bForceTabulatedEwald)
    {
        gmx_incons("Both analytical and tabulated Ewald OpenCL non-bonded kernels "
                   "requested through environment variables.");
    }

    /* OpenCL: By default, use analytical Ewald.
     * TODO: tabulated does not work, it needs fixing, see init_nbparam() in nbnxn_ocl_data_mgmt.cpp
     *
     * TODO: decide if a dev_info parameter should be added to recognize NVIDIA CC>=3.0 devices.
     */
    //if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
    if ((1 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
    {
        bUseAnalyticalEwald = true;

        if (debug)
        {
            fprintf(debug, "Using analytical Ewald OpenCL kernels\n");
        }
    }
    else
    {
        bUseAnalyticalEwald = false;

        if (debug)
        {
            fprintf(debug, "Using tabulated Ewald OpenCL kernels\n");
        }
    }

    /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
       forces it (use it for debugging/benchmarking only). */
    if (!bTwinCut && (getenv("GMX_OCL_NB_EWALD_TWINCUT") == NULL))
    {
        kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA : eelOclEWALD_TAB;
    }
    else
    {
        kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA_TWIN : eelOclEWALD_TAB_TWIN;
    }

    return kernel_type;
}
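
/* Example (benchmarking only; a hypothetical shell invocation, not part of
 * this file): forcing the analytical twin cut-off flavor via the environment
 * variables checked above:
 *
 *     GMX_OCL_NB_ANA_EWALD=1 GMX_OCL_NB_EWALD_TWINCUT=1 gmx mdrun ...
 */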