src/gromacs/gmxlib/cuda_tools/cudautils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "gmxpre.h"
  37
  38 #include <stdlib.h>
  39
  40 #include "gromacs/utility/smalloc.h"
  41 #include "gromacs/legacyheaders/typedefs.h"
  42 #include "cudautils.cuh"
  43
  44 /*** Generic CUDA data operation wrappers ***/
  45
  46 /*! Launches synchronous or asynchronous host to device memory copy.
  47  *
  48  *  The copy is launched in stream s or if not specified, in stream 0.
  49  */
  50 static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes,
  51                                bool bAsync = false, cudaStream_t s = 0)
  52 {
  53     cudaError_t stat;
  54
  55     if (h_dest == NULL || d_src == NULL || bytes == 0)
  56     {
  57         return -1;
  58     }
  59
  60     if (bAsync)
  61     {
  62         stat = cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s);
  63         CU_RET_ERR(stat, "DtoH cudaMemcpyAsync failed");
  64
  65     }
  66     else
  67     {
  68         stat = cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
  69         CU_RET_ERR(stat, "DtoH cudaMemcpy failed");
  70     }
  71
  72     return 0;
  73 }
  74
  75 int cu_copy_D2H(void * h_dest, void * d_src, size_t bytes)
  76 {
  77     return cu_copy_D2H_generic(h_dest, d_src, bytes, false);
  78 }
  79
  80 /*!
  81  *  The copy is launched in stream s or if not specified, in stream 0.
  82  */
  83 int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s = 0)
  84 {
  85     return cu_copy_D2H_generic(h_dest, d_src, bytes, true, s);
  86 }
  87
  88 int cu_copy_D2H_alloc(void ** h_dest, void * d_src, size_t bytes)
  89 {
  90     if (h_dest == NULL || d_src == NULL || bytes == 0)
  91     {
  92         return -1;
  93     }
  94
  95     smalloc(*h_dest, bytes);
  96
  97     return cu_copy_D2H(*h_dest, d_src, bytes);
  98 }
  99
 100 /*! Launches synchronous or asynchronous device to host memory copy.
 101  *
 102  *  The copy is launched in stream s or if not specified, in stream 0.
 103  */
 104 static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,
 105                                bool bAsync = false, cudaStream_t s = 0)
 106 {
 107     cudaError_t stat;
 108
 109     if (d_dest == NULL || h_src == NULL || bytes == 0)
 110     {
 111         return -1;
 112     }
 113
 114     if (bAsync)
 115     {
 116         stat = cudaMemcpyAsync(d_dest, h_src, bytes, cudaMemcpyHostToDevice, s);
 117         CU_RET_ERR(stat, "HtoD cudaMemcpyAsync failed");
 118     }
 119     else
 120     {
 121         stat = cudaMemcpy(d_dest, h_src, bytes, cudaMemcpyHostToDevice);
 122         CU_RET_ERR(stat, "HtoD cudaMemcpy failed");
 123     }
 124
 125     return 0;
 126 }
 127
 128 int cu_copy_H2D(void * d_dest, void * h_src, size_t bytes)
 129 {
 130     return cu_copy_H2D_generic(d_dest, h_src, bytes, false);
 131 }
 132
 133 /*!
 134  *  The copy is launched in stream s or if not specified, in stream 0.
 135  */
 136 int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = 0)
 137 {
 138     return cu_copy_H2D_generic(d_dest, h_src, bytes, true, s);
 139 }
 140
 141 int cu_copy_H2D_alloc(void ** d_dest, void * h_src, size_t bytes)
 142 {
 143     cudaError_t stat;
 144
 145     if (d_dest == NULL || h_src == NULL || bytes == 0)
 146     {
 147         return -1;
 148     }
 149
 150     stat = cudaMalloc(d_dest, bytes);
 151     CU_RET_ERR(stat, "cudaMalloc failed in cu_copy_H2D_alloc");
 152
 153     return cu_copy_H2D(*d_dest, h_src, bytes);
 154 }
 155
 156 float cu_event_elapsed(cudaEvent_t start, cudaEvent_t end)
 157 {
 158     float       t = 0.0;
 159     cudaError_t stat;
 160
 161     stat = cudaEventElapsedTime(&t, start, end);
 162     CU_RET_ERR(stat, "cudaEventElapsedTime failed in cu_event_elapsed");
 163
 164     return t;
 165 }
 166
 167 int cu_wait_event(cudaEvent_t e)
 168 {
 169     cudaError_t s;
 170
 171     s = cudaEventSynchronize(e);
 172     CU_RET_ERR(s, "cudaEventSynchronize failed in cu_wait_event");
 173
 174     return 0;
 175 }
 176
 177 /*!
 178  *  If time != NULL it also calculates the time elapsed between start and end and
 179  *  return this is milliseconds.
 180  */
 181 int cu_wait_event_time(cudaEvent_t end, cudaEvent_t start, float *time)
 182 {
 183     cudaError_t s;
 184
 185     s = cudaEventSynchronize(end);
 186     CU_RET_ERR(s, "cudaEventSynchronize failed in cu_wait_event");
 187
 188     if (time)
 189     {
 190         *time = cu_event_elapsed(start, end);
 191     }
 192
 193     return 0;
 194 }
 195
 196 /**** Operation on buffered arrays (arrays with "over-allocation" in gmx wording) *****/
 197
 198 /*!
 199  * If the pointers to the size variables are NULL no resetting happens.
 200  */
 201 void cu_free_buffered(void *d_ptr, int *n, int *nalloc)
 202 {
 203     cudaError_t stat;
 204
 205     if (d_ptr)
 206     {
 207         stat = cudaFree(d_ptr);
 208         CU_RET_ERR(stat, "cudaFree failed");
 209     }
 210
 211     if (n)
 212     {
 213         *n = -1;
 214     }
 215
 216     if (nalloc)
 217     {
 218         *nalloc = -1;
 219     }
 220 }
 221
 222 /*!
 223  *  Reallocation of the memory pointed by d_ptr and copying of the data from
 224  *  the location pointed by h_src host-side pointer is done. Allocation is
 225  *  buffered and therefore freeing is only needed if the previously allocated
 226  *  space is not enough.
 227  *  The H2D copy is launched in stream s and can be done synchronously or
 228  *  asynchronously (the default is the latter).
 229  */
 230 void cu_realloc_buffered(void **d_dest, void *h_src,
 231                          size_t type_size,
 232                          int *curr_size, int *curr_alloc_size,
 233                          int req_size,
 234                          cudaStream_t s,
 235                          bool bAsync = true)
 236 {
 237     cudaError_t stat;
 238
 239     if (d_dest == NULL || req_size < 0)
 240     {
 241         return;
 242     }
 243
 244     /* reallocate only if the data does not fit = allocation size is smaller
 245        than the current requested size */
 246     if (req_size > *curr_alloc_size)
 247     {
 248         /* only free if the array has already been initialized */
 249         if (*curr_alloc_size >= 0)
 250         {
 251             cu_free_buffered(*d_dest, curr_size, curr_alloc_size);
 252         }
 253
 254         *curr_alloc_size = over_alloc_large(req_size);
 255
 256         stat = cudaMalloc(d_dest, *curr_alloc_size * type_size);
 257         CU_RET_ERR(stat, "cudaMalloc failed in cu_free_buffered");
 258     }
 259
 260     /* size could have changed without actual reallocation */
 261     *curr_size = req_size;
 262
 263     /* upload to device */
 264     if (h_src)
 265     {
 266         if (bAsync)
 267         {
 268             cu_copy_H2D_async(*d_dest, h_src, *curr_size * type_size, s);
 269         }
 270         else
 271         {
 272             cu_copy_H2D(*d_dest, h_src,  *curr_size * type_size);
 273         }
 274     }
 275 }