2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * \brief Define utility routines for OpenCL
38 * \author Anca Hamuraru <anca@streamcomputing.eu>
51 #include "gromacs/gpu_utils/gpu_utils.h"
52 #include "gromacs/utility/fatalerror.h"
53 #include "gromacs/utility/smalloc.h"
55 int ocl_copy_H2D(cl_mem d_dest, const void* h_src,
56 size_t offset, size_t bytes,
57 GpuApiCallBehavior transferKind,
58 cl_command_queue command_queue,
61 cl_int gmx_unused cl_error;
63 if (d_dest == nullptr || h_src == nullptr || bytes == 0)
70 case GpuApiCallBehavior::Async:
71 cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_FALSE, offset, bytes, h_src, 0, nullptr, copy_event);
72 assert(cl_error == CL_SUCCESS);
73 // TODO: handle errors
76 case GpuApiCallBehavior::Sync:
77 cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_TRUE, offset, bytes, h_src, 0, nullptr, copy_event);
78 assert(cl_error == CL_SUCCESS);
79 // TODO: handle errors
89 /*! \brief Launches asynchronous host to device memory copy.
91 * If copy_event is not nullptr, on return it will contain an event object
92 * identifying this particular host to device operation. The event can further
93 * be used to queue a wait for this operation or to query profiling information.
95 int ocl_copy_H2D_async(cl_mem d_dest, const void * h_src,
96 size_t offset, size_t bytes,
97 cl_command_queue command_queue,
100 return ocl_copy_H2D(d_dest, h_src, offset, bytes, GpuApiCallBehavior::Async, command_queue, copy_event);
103 /*! \brief Launches synchronous host to device memory copy.
105 int ocl_copy_H2D_sync(cl_mem d_dest, const void * h_src,
106 size_t offset, size_t bytes,
107 cl_command_queue command_queue)
109 return ocl_copy_H2D(d_dest, h_src, offset, bytes, GpuApiCallBehavior::Sync, command_queue, nullptr);
112 int ocl_copy_D2H(void * h_dest, cl_mem d_src,
113 size_t offset, size_t bytes,
114 GpuApiCallBehavior transferKind,
115 cl_command_queue command_queue,
116 cl_event *copy_event)
118 cl_int gmx_unused cl_error;
120 if (h_dest == nullptr || d_src == nullptr || bytes == 0)
125 switch (transferKind)
127 case GpuApiCallBehavior::Async:
128 cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_FALSE, offset, bytes, h_dest, 0, nullptr, copy_event);
129 assert(cl_error == CL_SUCCESS);
130 // TODO: handle errors
133 case GpuApiCallBehavior::Sync:
134 cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_TRUE, offset, bytes, h_dest, 0, nullptr, copy_event);
135 assert(cl_error == CL_SUCCESS);
136 // TODO: handle errors
146 /*! \brief Launches asynchronous device to host memory copy.
148 * If copy_event is not nullptr, on return it will contain an event object
149 * identifying this particular host to device operation. The event can further
150 * be used to queue a wait for this operation or to query profiling information.
152 int ocl_copy_D2H_async(void * h_dest, cl_mem d_src,
153 size_t offset, size_t bytes,
154 cl_command_queue command_queue,
155 cl_event *copy_event)
157 return ocl_copy_D2H(h_dest, d_src, offset, bytes, GpuApiCallBehavior::Async, command_queue, copy_event);
160 /*! \brief \brief Allocates nbytes of host memory. Use ocl_free to free memory allocated with this function.
163 * This function should allocate page-locked memory to help reduce D2H and H2D
164 * transfer times, similar with pmalloc from pmalloc_cuda.cu.
166 * \param[in,out] h_ptr Pointer where to store the address of the newly allocated buffer.
167 * \param[in] nbytes Size in bytes of the buffer to be allocated.
169 void pmalloc(void **h_ptr, size_t nbytes)
171 /* Need a temporary type whose size is 1 byte, so that the
172 * implementation of snew_aligned can cope without issuing
174 char **temporary = reinterpret_cast<char **>(h_ptr);
176 /* 16-byte alignment is required by the neighbour-searching code,
177 * because it uses four-wide SIMD for bounding-box calculation.
178 * However, when we organize using page-locked memory for
179 * device-host transfers, it will probably need to be aligned to a
180 * 4kb page, like CUDA does. */
181 snew_aligned(*temporary, nbytes, 16);
184 /*! \brief Frees memory allocated with pmalloc.
186 * \param[in] h_ptr Buffer allocated with pmalloc that needs to be freed.
188 void pfree(void *h_ptr)
193 sfree_aligned(h_ptr);
197 /*! \brief Convert error code to diagnostic string */
198 std::string ocl_get_error_string(cl_int error)
202 // run-time and JIT compiler errors
203 case 0: return "CL_SUCCESS";
204 case -1: return "CL_DEVICE_NOT_FOUND";
205 case -2: return "CL_DEVICE_NOT_AVAILABLE";
206 case -3: return "CL_COMPILER_NOT_AVAILABLE";
207 case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
208 case -5: return "CL_OUT_OF_RESOURCES";
209 case -6: return "CL_OUT_OF_HOST_MEMORY";
210 case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE";
211 case -8: return "CL_MEM_COPY_OVERLAP";
212 case -9: return "CL_IMAGE_FORMAT_MISMATCH";
213 case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
214 case -11: return "CL_BUILD_PROGRAM_FAILURE";
215 case -12: return "CL_MAP_FAILURE";
216 case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
217 case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
218 case -15: return "CL_COMPILE_PROGRAM_FAILURE";
219 case -16: return "CL_LINKER_NOT_AVAILABLE";
220 case -17: return "CL_LINK_PROGRAM_FAILURE";
221 case -18: return "CL_DEVICE_PARTITION_FAILED";
222 case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
224 // compile-time errors
225 case -30: return "CL_INVALID_VALUE";
226 case -31: return "CL_INVALID_DEVICE_TYPE";
227 case -32: return "CL_INVALID_PLATFORM";
228 case -33: return "CL_INVALID_DEVICE";
229 case -34: return "CL_INVALID_CONTEXT";
230 case -35: return "CL_INVALID_QUEUE_PROPERTIES";
231 case -36: return "CL_INVALID_COMMAND_QUEUE";
232 case -37: return "CL_INVALID_HOST_PTR";
233 case -38: return "CL_INVALID_MEM_OBJECT";
234 case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
235 case -40: return "CL_INVALID_IMAGE_SIZE";
236 case -41: return "CL_INVALID_SAMPLER";
237 case -42: return "CL_INVALID_BINARY";
238 case -43: return "CL_INVALID_BUILD_OPTIONS";
239 case -44: return "CL_INVALID_PROGRAM";
240 case -45: return "CL_INVALID_PROGRAM_EXECUTABLE";
241 case -46: return "CL_INVALID_KERNEL_NAME";
242 case -47: return "CL_INVALID_KERNEL_DEFINITION";
243 case -48: return "CL_INVALID_KERNEL";
244 case -49: return "CL_INVALID_ARG_INDEX";
245 case -50: return "CL_INVALID_ARG_VALUE";
246 case -51: return "CL_INVALID_ARG_SIZE";
247 case -52: return "CL_INVALID_KERNEL_ARGS";
248 case -53: return "CL_INVALID_WORK_DIMENSION";
249 case -54: return "CL_INVALID_WORK_GROUP_SIZE";
250 case -55: return "CL_INVALID_WORK_ITEM_SIZE";
251 case -56: return "CL_INVALID_GLOBAL_OFFSET";
252 case -57: return "CL_INVALID_EVENT_WAIT_LIST";
253 case -58: return "CL_INVALID_EVENT";
254 case -59: return "CL_INVALID_OPERATION";
255 case -60: return "CL_INVALID_GL_OBJECT";
256 case -61: return "CL_INVALID_BUFFER_SIZE";
257 case -62: return "CL_INVALID_MIP_LEVEL";
258 case -63: return "CL_INVALID_GLOBAL_WORK_SIZE";
259 case -64: return "CL_INVALID_PROPERTY";
260 case -65: return "CL_INVALID_IMAGE_DESCRIPTOR";
261 case -66: return "CL_INVALID_COMPILER_OPTIONS";
262 case -67: return "CL_INVALID_LINKER_OPTIONS";
263 case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT";
266 case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
267 case -1001: return "CL_PLATFORM_NOT_FOUND_KHR";
268 case -1002: return "CL_INVALID_D3D10_DEVICE_KHR";
269 case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR";
270 case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR";
271 case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR";
272 default: return "Unknown OpenCL error: " +
273 std::to_string(static_cast<int32_t>(error));