src/gromacs/gmxlib/cuda_tools/cudautils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include <stdlib.h>
  37
  38 #include "gmx_fatal.h"
  39 #include "smalloc.h"
  40 #include "typedefs.h"
  41 #include "cudautils.cuh"
  42
  43 /*** Generic CUDA data operation wrappers ***/
  44
  45 /*! Launches synchronous or asynchronous host to device memory copy.
  46  *
  47  *  The copy is launched in stream s or if not specified, in stream 0.
  48  */
  49 static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes,
  50                                bool bAsync = false, cudaStream_t s = 0)
  51 {
  52     cudaError_t stat;
  53
  54     if (h_dest == NULL || d_src == NULL || bytes == 0)
  55     {
  56         return -1;
  57     }
  58
  59     if (bAsync)
  60     {
  61         stat = cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s);
  62         CU_RET_ERR(stat, "DtoH cudaMemcpyAsync failed");
  63
  64     }
  65     else
  66     {
  67         stat = cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
  68         CU_RET_ERR(stat, "DtoH cudaMemcpy failed");
  69     }
  70
  71     return 0;
  72 }
  73
  74 int cu_copy_D2H(void * h_dest, void * d_src, size_t bytes)
  75 {
  76     return cu_copy_D2H_generic(h_dest, d_src, bytes, false);
  77 }
  78
  79 /*!
  80  *  The copy is launched in stream s or if not specified, in stream 0.
  81  */
  82 int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s = 0)
  83 {
  84     return cu_copy_D2H_generic(h_dest, d_src, bytes, true, s);
  85 }
  86
  87 int cu_copy_D2H_alloc(void ** h_dest, void * d_src, size_t bytes)
  88 {
  89     if (h_dest == NULL || d_src == NULL || bytes == 0)
  90     {
  91         return -1;
  92     }
  93
  94     smalloc(*h_dest, bytes);
  95
  96     return cu_copy_D2H(*h_dest, d_src, bytes);
  97 }
  98
  99 /*! Launches synchronous or asynchronous device to host memory copy.
 100  *
 101  *  The copy is launched in stream s or if not specified, in stream 0.
 102  */
 103 static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,
 104                                bool bAsync = false, cudaStream_t s = 0)
 105 {
 106     cudaError_t stat;
 107
 108     if (d_dest == NULL || h_src == NULL || bytes == 0)
 109     {
 110         return -1;
 111     }
 112
 113     if (bAsync)
 114     {
 115         stat = cudaMemcpyAsync(d_dest, h_src, bytes, cudaMemcpyHostToDevice, s);
 116         CU_RET_ERR(stat, "HtoD cudaMemcpyAsync failed");
 117     }
 118     else
 119     {
 120         stat = cudaMemcpy(d_dest, h_src, bytes, cudaMemcpyHostToDevice);
 121         CU_RET_ERR(stat, "HtoD cudaMemcpy failed");
 122     }
 123
 124     return 0;
 125 }
 126
 127 int cu_copy_H2D(void * d_dest, void * h_src, size_t bytes)
 128 {
 129     return cu_copy_H2D_generic(d_dest, h_src, bytes, false);
 130 }
 131
 132 /*!
 133  *  The copy is launched in stream s or if not specified, in stream 0.
 134  */
 135 int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = 0)
 136 {
 137     return cu_copy_H2D_generic(d_dest, h_src, bytes, true, s);
 138 }
 139
 140 int cu_copy_H2D_alloc(void ** d_dest, void * h_src, size_t bytes)
 141 {
 142     cudaError_t stat;
 143
 144     if (d_dest == NULL || h_src == NULL || bytes == 0)
 145     {
 146         return -1;
 147     }
 148
 149     stat = cudaMalloc(d_dest, bytes);
 150     CU_RET_ERR(stat, "cudaMalloc failed in cu_copy_H2D_alloc");
 151
 152     return cu_copy_H2D(*d_dest, h_src, bytes);
 153 }
 154
 155 float cu_event_elapsed(cudaEvent_t start, cudaEvent_t end)
 156 {
 157     float       t = 0.0;
 158     cudaError_t stat;
 159
 160     stat = cudaEventElapsedTime(&t, start, end);
 161     CU_RET_ERR(stat, "cudaEventElapsedTime failed in cu_event_elapsed");
 162
 163     return t;
 164 }
 165
 166 int cu_wait_event(cudaEvent_t e)
 167 {
 168     cudaError_t s;
 169
 170     s = cudaEventSynchronize(e);
 171     CU_RET_ERR(s, "cudaEventSynchronize failed in cu_wait_event");
 172
 173     return 0;
 174 }
 175
 176 /*!
 177  *  If time != NULL it also calculates the time elapsed between start and end and
 178  *  return this is milliseconds.
 179  */
 180 int cu_wait_event_time(cudaEvent_t end, cudaEvent_t start, float *time)
 181 {
 182     cudaError_t s;
 183
 184     s = cudaEventSynchronize(end);
 185     CU_RET_ERR(s, "cudaEventSynchronize failed in cu_wait_event");
 186
 187     if (time)
 188     {
 189         *time = cu_event_elapsed(start, end);
 190     }
 191
 192     return 0;
 193 }
 194
 195 /**** Operation on buffered arrays (arrays with "over-allocation" in gmx wording) *****/
 196
 197 /*!
 198  * If the pointers to the size variables are NULL no resetting happens.
 199  */
 200 void cu_free_buffered(void *d_ptr, int *n, int *nalloc)
 201 {
 202     cudaError_t stat;
 203
 204     if (d_ptr)
 205     {
 206         stat = cudaFree(d_ptr);
 207         CU_RET_ERR(stat, "cudaFree failed");
 208     }
 209
 210     if (n)
 211     {
 212         *n = -1;
 213     }
 214
 215     if (nalloc)
 216     {
 217         *nalloc = -1;
 218     }
 219 }
 220
 221 /*!
 222  *  Reallocation of the memory pointed by d_ptr and copying of the data from
 223  *  the location pointed by h_src host-side pointer is done. Allocation is
 224  *  buffered and therefore freeing is only needed if the previously allocated
 225  *  space is not enough.
 226  *  The H2D copy is launched in stream s and can be done synchronously or
 227  *  asynchronously (the default is the latter).
 228  */
 229 void cu_realloc_buffered(void **d_dest, void *h_src,
 230                          size_t type_size,
 231                          int *curr_size, int *curr_alloc_size,
 232                          int req_size,
 233                          cudaStream_t s,
 234                          bool bAsync = true)
 235 {
 236     cudaError_t stat;
 237
 238     if (d_dest == NULL || req_size < 0)
 239     {
 240         return;
 241     }
 242
 243     /* reallocate only if the data does not fit = allocation size is smaller
 244        than the current requested size */
 245     if (req_size > *curr_alloc_size)
 246     {
 247         /* only free if the array has already been initialized */
 248         if (*curr_alloc_size >= 0)
 249         {
 250             cu_free_buffered(*d_dest, curr_size, curr_alloc_size);
 251         }
 252
 253         *curr_alloc_size = over_alloc_large(req_size);
 254
 255         stat = cudaMalloc(d_dest, *curr_alloc_size * type_size);
 256         CU_RET_ERR(stat, "cudaMalloc failed in cu_free_buffered");
 257     }
 258
 259     /* size could have changed without actual reallocation */
 260     *curr_size = req_size;
 261
 262     /* upload to device */
 263     if (h_src)
 264     {
 265         if (bAsync)
 266         {
 267             cu_copy_H2D_async(*d_dest, h_src, *curr_size * type_size, s);
 268         }
 269         else
 270         {
 271             cu_copy_H2D(*d_dest, h_src,  *curr_size * type_size);
 272         }
 273     }
 274 }