2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2012, The GROMACS development team,
6 * check out http://www.gromacs.org for more information.
7 * Copyright (c) 2012, by the GROMACS development team, led by
8 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
9 * others, as listed in the AUTHORS file in the top-level source
10 * directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
41 #include "gmx_fatal.h"
44 #include "cudautils.cuh"
46 /*** Generic CUDA data operation wrappers ***/
48 /*! Launches synchronous or asynchronous host to device memory copy.
50 * The copy is launched in stream s or if not specified, in stream 0.
52 static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes,
53 bool bAsync = false, cudaStream_t s = 0)
57 if (h_dest == NULL || d_src == NULL || bytes == 0)
62 stat = cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s);
63 CU_RET_ERR(stat, "DtoH cudaMemcpyAsync failed");
68 stat = cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
69 CU_RET_ERR(stat, "DtoH cudaMemcpy failed");
75 int cu_copy_D2H(void * h_dest, void * d_src, size_t bytes)
77 return cu_copy_D2H_generic(h_dest, d_src, bytes, false);
81 * The copy is launched in stream s or if not specified, in stream 0.
83 int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s = 0)
85 return cu_copy_D2H_generic(h_dest, d_src, bytes, true, s);
88 int cu_copy_D2H_alloc(void ** h_dest, void * d_src, size_t bytes)
90 if (h_dest == NULL || d_src == NULL || bytes == 0)
93 smalloc(*h_dest, bytes);
95 return cu_copy_D2H(*h_dest, d_src, bytes);
98 /*! Launches synchronous or asynchronous device to host memory copy.
100 * The copy is launched in stream s or if not specified, in stream 0.
102 static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,
103 bool bAsync = false, cudaStream_t s = 0)
107 if (d_dest == NULL || h_src == NULL || bytes == 0)
112 stat = cudaMemcpyAsync(d_dest, h_src, bytes, cudaMemcpyHostToDevice, s);
113 CU_RET_ERR(stat, "HtoD cudaMemcpyAsync failed");
117 stat = cudaMemcpy(d_dest, h_src, bytes, cudaMemcpyHostToDevice);
118 CU_RET_ERR(stat, "HtoD cudaMemcpy failed");
124 int cu_copy_H2D(void * d_dest, void * h_src, size_t bytes)
126 return cu_copy_H2D_generic(d_dest, h_src, bytes, false);
130 * The copy is launched in stream s or if not specified, in stream 0.
132 int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = 0)
134 return cu_copy_H2D_generic(d_dest, h_src, bytes, true, s);
137 int cu_copy_H2D_alloc(void ** d_dest, void * h_src, size_t bytes)
141 if (d_dest == NULL || h_src == NULL || bytes == 0)
144 stat = cudaMalloc(d_dest, bytes);
145 CU_RET_ERR(stat, "cudaMalloc failed in cu_copy_H2D_alloc");
147 return cu_copy_H2D(*d_dest, h_src, bytes);
150 float cu_event_elapsed(cudaEvent_t start, cudaEvent_t end)
155 stat = cudaEventElapsedTime(&t, start, end);
156 CU_RET_ERR(stat, "cudaEventElapsedTime failed in cu_event_elapsed");
161 int cu_wait_event(cudaEvent_t e)
165 s = cudaEventSynchronize(e);
166 CU_RET_ERR(s, "cudaEventSynchronize failed in cu_wait_event");
172 * If time != NULL it also calculates the time elapsed between start and end and
173 * return this is milliseconds.
175 int cu_wait_event_time(cudaEvent_t end, cudaEvent_t start, float *time)
179 s = cudaEventSynchronize(end);
180 CU_RET_ERR(s, "cudaEventSynchronize failed in cu_wait_event");
184 *time = cu_event_elapsed(start, end);
190 /**** Operation on buffered arrays (arrays with "over-allocation" in gmx wording) *****/
193 * If the pointers to the size variables are NULL no resetting happens.
195 void cu_free_buffered(void *d_ptr, int *n, int *nalloc)
201 stat = cudaFree(d_ptr);
202 CU_RET_ERR(stat, "cudaFree failed");
217 * Reallocation of the memory pointed by d_ptr and copying of the data from
218 * the location pointed by h_src host-side pointer is done. Allocation is
219 * buffered and therefore freeing is only needed if the previously allocated
220 * space is not enough.
221 * The H2D copy is launched in stream s and can be done synchronously or
222 * asynchronously (the default is the latter).
224 void cu_realloc_buffered(void **d_dest, void *h_src,
226 int *curr_size, int *curr_alloc_size,
233 if (d_dest == NULL || req_size < 0)
238 /* reallocate only if the data does not fit = allocation size is smaller
239 than the current requested size */
240 if (req_size > *curr_alloc_size)
242 /* only free if the array has already been initialized */
243 if (*curr_alloc_size >= 0)
245 cu_free_buffered(*d_dest, curr_size, curr_alloc_size);
248 *curr_alloc_size = over_alloc_large(req_size);
250 stat = cudaMalloc(d_dest, *curr_alloc_size * type_size);
251 CU_RET_ERR(stat, "cudaMalloc failed in cu_free_buffered");
254 /* size could have changed without actual reallocation */
255 *curr_size = req_size;
257 /* upload to device */
262 cu_copy_H2D_async(*d_dest, h_src, *curr_size * type_size, s);
266 cu_copy_H2D(*d_dest, h_src, *curr_size * type_size);