src/gromacs/gmxlib/cuda_tools/cudautils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "gmxpre.h"
  37
  38 #include "cudautils.cuh"
  39
  40 #include <stdlib.h>
  41
  42 #include "gromacs/legacyheaders/typedefs.h"
  43 #include "gromacs/utility/smalloc.h"
  44
  45 /*** Generic CUDA data operation wrappers ***/
  46
  47 /*! Launches synchronous or asynchronous host to device memory copy.
  48  *
  49  *  The copy is launched in stream s or if not specified, in stream 0.
  50  */
  51 static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes,
  52                                bool bAsync = false, cudaStream_t s = 0)
  53 {
  54     cudaError_t stat;
  55
  56     if (h_dest == NULL || d_src == NULL || bytes == 0)
  57     {
  58         return -1;
  59     }
  60
  61     if (bAsync)
  62     {
  63         stat = cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s);
  64         CU_RET_ERR(stat, "DtoH cudaMemcpyAsync failed");
  65
  66     }
  67     else
  68     {
  69         stat = cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
  70         CU_RET_ERR(stat, "DtoH cudaMemcpy failed");
  71     }
  72
  73     return 0;
  74 }
  75
  76 int cu_copy_D2H(void * h_dest, void * d_src, size_t bytes)
  77 {
  78     return cu_copy_D2H_generic(h_dest, d_src, bytes, false);
  79 }
  80
  81 /*!
  82  *  The copy is launched in stream s or if not specified, in stream 0.
  83  */
  84 int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s = 0)
  85 {
  86     return cu_copy_D2H_generic(h_dest, d_src, bytes, true, s);
  87 }
  88
  89 int cu_copy_D2H_alloc(void ** h_dest, void * d_src, size_t bytes)
  90 {
  91     if (h_dest == NULL || d_src == NULL || bytes == 0)
  92     {
  93         return -1;
  94     }
  95
  96     smalloc(*h_dest, bytes);
  97
  98     return cu_copy_D2H(*h_dest, d_src, bytes);
  99 }
 100
 101 /*! Launches synchronous or asynchronous device to host memory copy.
 102  *
 103  *  The copy is launched in stream s or if not specified, in stream 0.
 104  */
 105 static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,
 106                                bool bAsync = false, cudaStream_t s = 0)
 107 {
 108     cudaError_t stat;
 109
 110     if (d_dest == NULL || h_src == NULL || bytes == 0)
 111     {
 112         return -1;
 113     }
 114
 115     if (bAsync)
 116     {
 117         stat = cudaMemcpyAsync(d_dest, h_src, bytes, cudaMemcpyHostToDevice, s);
 118         CU_RET_ERR(stat, "HtoD cudaMemcpyAsync failed");
 119     }
 120     else
 121     {
 122         stat = cudaMemcpy(d_dest, h_src, bytes, cudaMemcpyHostToDevice);
 123         CU_RET_ERR(stat, "HtoD cudaMemcpy failed");
 124     }
 125
 126     return 0;
 127 }
 128
 129 int cu_copy_H2D(void * d_dest, void * h_src, size_t bytes)
 130 {
 131     return cu_copy_H2D_generic(d_dest, h_src, bytes, false);
 132 }
 133
 134 /*!
 135  *  The copy is launched in stream s or if not specified, in stream 0.
 136  */
 137 int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = 0)
 138 {
 139     return cu_copy_H2D_generic(d_dest, h_src, bytes, true, s);
 140 }
 141
 142 int cu_copy_H2D_alloc(void ** d_dest, void * h_src, size_t bytes)
 143 {
 144     cudaError_t stat;
 145
 146     if (d_dest == NULL || h_src == NULL || bytes == 0)
 147     {
 148         return -1;
 149     }
 150
 151     stat = cudaMalloc(d_dest, bytes);
 152     CU_RET_ERR(stat, "cudaMalloc failed in cu_copy_H2D_alloc");
 153
 154     return cu_copy_H2D(*d_dest, h_src, bytes);
 155 }
 156
 157 float cu_event_elapsed(cudaEvent_t start, cudaEvent_t end)
 158 {
 159     float       t = 0.0;
 160     cudaError_t stat;
 161
 162     stat = cudaEventElapsedTime(&t, start, end);
 163     CU_RET_ERR(stat, "cudaEventElapsedTime failed in cu_event_elapsed");
 164
 165     return t;
 166 }
 167
 168 int cu_wait_event(cudaEvent_t e)
 169 {
 170     cudaError_t s;
 171
 172     s = cudaEventSynchronize(e);
 173     CU_RET_ERR(s, "cudaEventSynchronize failed in cu_wait_event");
 174
 175     return 0;
 176 }
 177
 178 /*!
 179  *  If time != NULL it also calculates the time elapsed between start and end and
 180  *  return this is milliseconds.
 181  */
 182 int cu_wait_event_time(cudaEvent_t end, cudaEvent_t start, float *time)
 183 {
 184     cudaError_t s;
 185
 186     s = cudaEventSynchronize(end);
 187     CU_RET_ERR(s, "cudaEventSynchronize failed in cu_wait_event");
 188
 189     if (time)
 190     {
 191         *time = cu_event_elapsed(start, end);
 192     }
 193
 194     return 0;
 195 }
 196
 197 /**** Operation on buffered arrays (arrays with "over-allocation" in gmx wording) *****/
 198
 199 /*!
 200  * If the pointers to the size variables are NULL no resetting happens.
 201  */
 202 void cu_free_buffered(void *d_ptr, int *n, int *nalloc)
 203 {
 204     cudaError_t stat;
 205
 206     if (d_ptr)
 207     {
 208         stat = cudaFree(d_ptr);
 209         CU_RET_ERR(stat, "cudaFree failed");
 210     }
 211
 212     if (n)
 213     {
 214         *n = -1;
 215     }
 216
 217     if (nalloc)
 218     {
 219         *nalloc = -1;
 220     }
 221 }
 222
 223 /*!
 224  *  Reallocation of the memory pointed by d_ptr and copying of the data from
 225  *  the location pointed by h_src host-side pointer is done. Allocation is
 226  *  buffered and therefore freeing is only needed if the previously allocated
 227  *  space is not enough.
 228  *  The H2D copy is launched in stream s and can be done synchronously or
 229  *  asynchronously (the default is the latter).
 230  */
 231 void cu_realloc_buffered(void **d_dest, void *h_src,
 232                          size_t type_size,
 233                          int *curr_size, int *curr_alloc_size,
 234                          int req_size,
 235                          cudaStream_t s,
 236                          bool bAsync = true)
 237 {
 238     cudaError_t stat;
 239
 240     if (d_dest == NULL || req_size < 0)
 241     {
 242         return;
 243     }
 244
 245     /* reallocate only if the data does not fit = allocation size is smaller
 246        than the current requested size */
 247     if (req_size > *curr_alloc_size)
 248     {
 249         /* only free if the array has already been initialized */
 250         if (*curr_alloc_size >= 0)
 251         {
 252             cu_free_buffered(*d_dest, curr_size, curr_alloc_size);
 253         }
 254
 255         *curr_alloc_size = over_alloc_large(req_size);
 256
 257         stat = cudaMalloc(d_dest, *curr_alloc_size * type_size);
 258         CU_RET_ERR(stat, "cudaMalloc failed in cu_free_buffered");
 259     }
 260
 261     /* size could have changed without actual reallocation */
 262     *curr_size = req_size;
 263
 264     /* upload to device */
 265     if (h_src)
 266     {
 267         if (bAsync)
 268         {
 269             cu_copy_H2D_async(*d_dest, h_src, *curr_size * type_size, s);
 270         }
 271         else
 272         {
 273             cu_copy_H2D(*d_dest, h_src,  *curr_size * type_size);
 274         }
 275     }
 276 }