src/gmxlib/cuda_tools/cudautils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2012, The GROMACS development team,
   6  * check out http://www.gromacs.org for more information.
   7  * Copyright (c) 2012, by the GROMACS development team, led by
   8  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   9  * others, as listed in the AUTHORS file in the top-level source
  10  * directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38
  39 #include <stdlib.h>
  40
  41 #include "gmx_fatal.h"
  42 #include "smalloc.h"
  43 #include "typedefs.h"
  44 #include "cudautils.cuh"
  45
  46 /*** Generic CUDA data operation wrappers ***/
  47
  48 /*! Launches synchronous or asynchronous host to device memory copy.
  49  *
  50  *  The copy is launched in stream s or if not specified, in stream 0.
  51  */
  52 static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes,
  53                                bool bAsync = false, cudaStream_t s = 0)
  54 {
  55     cudaError_t stat;
  56
  57     if (h_dest == NULL || d_src == NULL || bytes == 0)
  58         return -1;
  59
  60     if (bAsync)
  61     {
  62         stat = cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s);
  63         CU_RET_ERR(stat, "DtoH cudaMemcpyAsync failed");
  64
  65     }
  66     else
  67     {
  68         stat = cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
  69         CU_RET_ERR(stat, "DtoH cudaMemcpy failed");
  70     }
  71
  72     return 0;
  73 }
  74
  75 int cu_copy_D2H(void * h_dest, void * d_src, size_t bytes)
  76 {
  77     return cu_copy_D2H_generic(h_dest, d_src, bytes, false);
  78 }
  79
  80 /*!
  81  *  The copy is launched in stream s or if not specified, in stream 0.
  82  */
  83 int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s = 0)
  84 {
  85     return cu_copy_D2H_generic(h_dest, d_src, bytes, true, s);
  86 }
  87
  88 int cu_copy_D2H_alloc(void ** h_dest, void * d_src, size_t bytes)
  89 {
  90     if (h_dest == NULL || d_src == NULL || bytes == 0)
  91         return -1;
  92
  93     smalloc(*h_dest, bytes);
  94
  95     return cu_copy_D2H(*h_dest, d_src, bytes);
  96 }
  97
  98 /*! Launches synchronous or asynchronous device to host memory copy.
  99  *
 100  *  The copy is launched in stream s or if not specified, in stream 0.
 101  */
 102 static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,
 103                                bool bAsync = false, cudaStream_t s = 0)
 104 {
 105     cudaError_t stat;
 106
 107     if (d_dest == NULL || h_src == NULL || bytes == 0)
 108         return -1;
 109
 110     if (bAsync)
 111     {
 112         stat = cudaMemcpyAsync(d_dest, h_src, bytes, cudaMemcpyHostToDevice, s);
 113         CU_RET_ERR(stat, "HtoD cudaMemcpyAsync failed");
 114     }
 115     else
 116     {
 117         stat = cudaMemcpy(d_dest, h_src, bytes, cudaMemcpyHostToDevice);
 118         CU_RET_ERR(stat, "HtoD cudaMemcpy failed");
 119     }
 120
 121     return 0;
 122 }
 123
 124 int cu_copy_H2D(void * d_dest, void * h_src, size_t bytes)
 125 {
 126     return cu_copy_H2D_generic(d_dest, h_src, bytes, false);
 127 }
 128
 129 /*!
 130  *  The copy is launched in stream s or if not specified, in stream 0.
 131  */
 132 int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = 0)
 133 {
 134     return cu_copy_H2D_generic(d_dest, h_src, bytes, true, s);
 135 }
 136
 137 int cu_copy_H2D_alloc(void ** d_dest, void * h_src, size_t bytes)
 138 {
 139     cudaError_t stat;
 140
 141     if (d_dest == NULL || h_src == NULL || bytes == 0)
 142         return -1;
 143
 144     stat = cudaMalloc(d_dest, bytes);
 145     CU_RET_ERR(stat, "cudaMalloc failed in cu_copy_H2D_alloc");
 146
 147     return cu_copy_H2D(*d_dest, h_src, bytes);
 148 }
 149
 150 float cu_event_elapsed(cudaEvent_t start, cudaEvent_t end)
 151 {
 152     float t = 0.0;
 153     cudaError_t stat;
 154
 155     stat = cudaEventElapsedTime(&t, start, end);
 156     CU_RET_ERR(stat, "cudaEventElapsedTime failed in cu_event_elapsed");
 157
 158     return t;
 159 }
 160
 161 int cu_wait_event(cudaEvent_t e)
 162 {
 163     cudaError_t s;
 164
 165     s = cudaEventSynchronize(e);
 166     CU_RET_ERR(s, "cudaEventSynchronize failed in cu_wait_event");
 167
 168     return 0;
 169 }
 170
 171 /*!
 172  *  If time != NULL it also calculates the time elapsed between start and end and
 173  *  return this is milliseconds.
 174  */
 175 int cu_wait_event_time(cudaEvent_t end, cudaEvent_t start, float *time)
 176 {
 177     cudaError_t s;
 178
 179     s = cudaEventSynchronize(end);
 180     CU_RET_ERR(s, "cudaEventSynchronize failed in cu_wait_event");
 181
 182     if (time)
 183     {
 184         *time = cu_event_elapsed(start, end);
 185     }
 186
 187     return 0;
 188 }
 189
 190 /**** Operation on buffered arrays (arrays with "over-allocation" in gmx wording) *****/
 191
 192 /*!
 193  * If the pointers to the size variables are NULL no resetting happens.
 194  */
 195 void cu_free_buffered(void *d_ptr, int *n, int *nalloc)
 196 {
 197     cudaError_t stat;
 198
 199     if (d_ptr)
 200     {
 201         stat = cudaFree(d_ptr);
 202         CU_RET_ERR(stat, "cudaFree failed");
 203     }
 204
 205     if (n)
 206     {
 207         *n = -1;
 208     }
 209
 210     if (nalloc)
 211     {
 212         *nalloc = -1;
 213     }
 214 }
 215
 216 /*!
 217  *  Reallocation of the memory pointed by d_ptr and copying of the data from
 218  *  the location pointed by h_src host-side pointer is done. Allocation is
 219  *  buffered and therefore freeing is only needed if the previously allocated
 220  *  space is not enough.
 221  *  The H2D copy is launched in stream s and can be done synchronously or
 222  *  asynchronously (the default is the latter).
 223  */
 224 void cu_realloc_buffered(void **d_dest, void *h_src,
 225                          size_t type_size,
 226                          int *curr_size, int *curr_alloc_size,
 227                          int req_size,
 228                          cudaStream_t s,
 229                          bool bAsync = true)
 230 {
 231     cudaError_t stat;
 232
 233     if (d_dest == NULL || req_size < 0)
 234     {
 235         return;
 236     }
 237
 238     /* reallocate only if the data does not fit = allocation size is smaller
 239        than the current requested size */
 240     if (req_size > *curr_alloc_size)
 241     {
 242         /* only free if the array has already been initialized */
 243         if (*curr_alloc_size >= 0)
 244         {
 245             cu_free_buffered(*d_dest, curr_size, curr_alloc_size);
 246         }
 247
 248         *curr_alloc_size = over_alloc_large(req_size);
 249
 250         stat = cudaMalloc(d_dest, *curr_alloc_size * type_size);
 251         CU_RET_ERR(stat, "cudaMalloc failed in cu_free_buffered");
 252     }
 253
 254     /* size could have changed without actual reallocation */
 255     *curr_size = req_size;
 256
 257     /* upload to device */
 258     if (h_src)
 259     {
 260         if (bAsync)
 261         {
 262             cu_copy_H2D_async(*d_dest, h_src, *curr_size * type_size, s);
 263         }
 264         else
 265         {
 266             cu_copy_H2D(*d_dest, h_src,  *curr_size * type_size);
 267         }
 268     }
 269 }