src/gromacs/gmxlib/cuda_tools/cudautils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include <stdlib.h>
  37
  38 #include "gromacs/utility/smalloc.h"
  39 #include "typedefs.h"
  40 #include "cudautils.cuh"
  41
  42 /*** Generic CUDA data operation wrappers ***/
  43
  44 /*! Launches synchronous or asynchronous host to device memory copy.
  45  *
  46  *  The copy is launched in stream s or if not specified, in stream 0.
  47  */
  48 static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes,
  49                                bool bAsync = false, cudaStream_t s = 0)
  50 {
  51     cudaError_t stat;
  52
  53     if (h_dest == NULL || d_src == NULL || bytes == 0)
  54     {
  55         return -1;
  56     }
  57
  58     if (bAsync)
  59     {
  60         stat = cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s);
  61         CU_RET_ERR(stat, "DtoH cudaMemcpyAsync failed");
  62
  63     }
  64     else
  65     {
  66         stat = cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
  67         CU_RET_ERR(stat, "DtoH cudaMemcpy failed");
  68     }
  69
  70     return 0;
  71 }
  72
  73 int cu_copy_D2H(void * h_dest, void * d_src, size_t bytes)
  74 {
  75     return cu_copy_D2H_generic(h_dest, d_src, bytes, false);
  76 }
  77
  78 /*!
  79  *  The copy is launched in stream s or if not specified, in stream 0.
  80  */
  81 int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s = 0)
  82 {
  83     return cu_copy_D2H_generic(h_dest, d_src, bytes, true, s);
  84 }
  85
  86 int cu_copy_D2H_alloc(void ** h_dest, void * d_src, size_t bytes)
  87 {
  88     if (h_dest == NULL || d_src == NULL || bytes == 0)
  89     {
  90         return -1;
  91     }
  92
  93     smalloc(*h_dest, bytes);
  94
  95     return cu_copy_D2H(*h_dest, d_src, bytes);
  96 }
  97
  98 /*! Launches synchronous or asynchronous device to host memory copy.
  99  *
 100  *  The copy is launched in stream s or if not specified, in stream 0.
 101  */
 102 static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,
 103                                bool bAsync = false, cudaStream_t s = 0)
 104 {
 105     cudaError_t stat;
 106
 107     if (d_dest == NULL || h_src == NULL || bytes == 0)
 108     {
 109         return -1;
 110     }
 111
 112     if (bAsync)
 113     {
 114         stat = cudaMemcpyAsync(d_dest, h_src, bytes, cudaMemcpyHostToDevice, s);
 115         CU_RET_ERR(stat, "HtoD cudaMemcpyAsync failed");
 116     }
 117     else
 118     {
 119         stat = cudaMemcpy(d_dest, h_src, bytes, cudaMemcpyHostToDevice);
 120         CU_RET_ERR(stat, "HtoD cudaMemcpy failed");
 121     }
 122
 123     return 0;
 124 }
 125
 126 int cu_copy_H2D(void * d_dest, void * h_src, size_t bytes)
 127 {
 128     return cu_copy_H2D_generic(d_dest, h_src, bytes, false);
 129 }
 130
 131 /*!
 132  *  The copy is launched in stream s or if not specified, in stream 0.
 133  */
 134 int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = 0)
 135 {
 136     return cu_copy_H2D_generic(d_dest, h_src, bytes, true, s);
 137 }
 138
 139 int cu_copy_H2D_alloc(void ** d_dest, void * h_src, size_t bytes)
 140 {
 141     cudaError_t stat;
 142
 143     if (d_dest == NULL || h_src == NULL || bytes == 0)
 144     {
 145         return -1;
 146     }
 147
 148     stat = cudaMalloc(d_dest, bytes);
 149     CU_RET_ERR(stat, "cudaMalloc failed in cu_copy_H2D_alloc");
 150
 151     return cu_copy_H2D(*d_dest, h_src, bytes);
 152 }
 153
 154 float cu_event_elapsed(cudaEvent_t start, cudaEvent_t end)
 155 {
 156     float       t = 0.0;
 157     cudaError_t stat;
 158
 159     stat = cudaEventElapsedTime(&t, start, end);
 160     CU_RET_ERR(stat, "cudaEventElapsedTime failed in cu_event_elapsed");
 161
 162     return t;
 163 }
 164
 165 int cu_wait_event(cudaEvent_t e)
 166 {
 167     cudaError_t s;
 168
 169     s = cudaEventSynchronize(e);
 170     CU_RET_ERR(s, "cudaEventSynchronize failed in cu_wait_event");
 171
 172     return 0;
 173 }
 174
 175 /*!
 176  *  If time != NULL it also calculates the time elapsed between start and end and
 177  *  return this is milliseconds.
 178  */
 179 int cu_wait_event_time(cudaEvent_t end, cudaEvent_t start, float *time)
 180 {
 181     cudaError_t s;
 182
 183     s = cudaEventSynchronize(end);
 184     CU_RET_ERR(s, "cudaEventSynchronize failed in cu_wait_event");
 185
 186     if (time)
 187     {
 188         *time = cu_event_elapsed(start, end);
 189     }
 190
 191     return 0;
 192 }
 193
 194 /**** Operation on buffered arrays (arrays with "over-allocation" in gmx wording) *****/
 195
 196 /*!
 197  * If the pointers to the size variables are NULL no resetting happens.
 198  */
 199 void cu_free_buffered(void *d_ptr, int *n, int *nalloc)
 200 {
 201     cudaError_t stat;
 202
 203     if (d_ptr)
 204     {
 205         stat = cudaFree(d_ptr);
 206         CU_RET_ERR(stat, "cudaFree failed");
 207     }
 208
 209     if (n)
 210     {
 211         *n = -1;
 212     }
 213
 214     if (nalloc)
 215     {
 216         *nalloc = -1;
 217     }
 218 }
 219
 220 /*!
 221  *  Reallocation of the memory pointed by d_ptr and copying of the data from
 222  *  the location pointed by h_src host-side pointer is done. Allocation is
 223  *  buffered and therefore freeing is only needed if the previously allocated
 224  *  space is not enough.
 225  *  The H2D copy is launched in stream s and can be done synchronously or
 226  *  asynchronously (the default is the latter).
 227  */
 228 void cu_realloc_buffered(void **d_dest, void *h_src,
 229                          size_t type_size,
 230                          int *curr_size, int *curr_alloc_size,
 231                          int req_size,
 232                          cudaStream_t s,
 233                          bool bAsync = true)
 234 {
 235     cudaError_t stat;
 236
 237     if (d_dest == NULL || req_size < 0)
 238     {
 239         return;
 240     }
 241
 242     /* reallocate only if the data does not fit = allocation size is smaller
 243        than the current requested size */
 244     if (req_size > *curr_alloc_size)
 245     {
 246         /* only free if the array has already been initialized */
 247         if (*curr_alloc_size >= 0)
 248         {
 249             cu_free_buffered(*d_dest, curr_size, curr_alloc_size);
 250         }
 251
 252         *curr_alloc_size = over_alloc_large(req_size);
 253
 254         stat = cudaMalloc(d_dest, *curr_alloc_size * type_size);
 255         CU_RET_ERR(stat, "cudaMalloc failed in cu_free_buffered");
 256     }
 257
 258     /* size could have changed without actual reallocation */
 259     *curr_size = req_size;
 260
 261     /* upload to device */
 262     if (h_src)
 263     {
 264         if (bAsync)
 265         {
 266             cu_copy_H2D_async(*d_dest, h_src, *curr_size * type_size, s);
 267         }
 268         else
 269         {
 270             cu_copy_H2D(*d_dest, h_src,  *curr_size * type_size);
 271         }
 272     }
 273 }