src/gromacs/gpu_utils/cudautils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2014,2015,2016,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "gmxpre.h"
  37
  38 #include "cudautils.cuh"
  39
  40 #include <cassert>
  41 #include <cstdlib>
  42
  43 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
  44 #include "gromacs/utility/smalloc.h"
  45
  46 /*** Generic CUDA data operation wrappers ***/
  47
  48 /*! Launches synchronous or asynchronous host to device memory copy.
  49  *
  50  *  The copy is launched in stream s or if not specified, in stream 0.
  51  */
  52 static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes,
  53                                bool bAsync = false, cudaStream_t s = 0)
  54 {
  55     cudaError_t stat;
  56
  57     if (h_dest == NULL || d_src == NULL || bytes == 0)
  58     {
  59         return -1;
  60     }
  61
  62     if (bAsync)
  63     {
  64         stat = cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s);
  65         CU_RET_ERR(stat, "DtoH cudaMemcpyAsync failed");
  66
  67     }
  68     else
  69     {
  70         stat = cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
  71         CU_RET_ERR(stat, "DtoH cudaMemcpy failed");
  72     }
  73
  74     return 0;
  75 }
  76
  77 int cu_copy_D2H_sync(void * h_dest, void * d_src, size_t bytes)
  78 {
  79     return cu_copy_D2H_generic(h_dest, d_src, bytes, false);
  80 }
  81
  82 /*!
  83  *  The copy is launched in stream s or if not specified, in stream 0.
  84  */
  85 int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s = 0)
  86 {
  87     return cu_copy_D2H_generic(h_dest, d_src, bytes, true, s);
  88 }
  89
  90 /*! Launches synchronous or asynchronous device to host memory copy.
  91  *
  92  *  The copy is launched in stream s or if not specified, in stream 0.
  93  */
  94 static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,
  95                                bool bAsync = false, cudaStream_t s = 0)
  96 {
  97     cudaError_t stat;
  98
  99     if (d_dest == NULL || h_src == NULL || bytes == 0)
 100     {
 101         return -1;
 102     }
 103
 104     if (bAsync)
 105     {
 106         stat = cudaMemcpyAsync(d_dest, h_src, bytes, cudaMemcpyHostToDevice, s);
 107         CU_RET_ERR(stat, "HtoD cudaMemcpyAsync failed");
 108     }
 109     else
 110     {
 111         stat = cudaMemcpy(d_dest, h_src, bytes, cudaMemcpyHostToDevice);
 112         CU_RET_ERR(stat, "HtoD cudaMemcpy failed");
 113     }
 114
 115     return 0;
 116 }
 117
 118 int cu_copy_H2D_sync(void * d_dest, void * h_src, size_t bytes)
 119 {
 120     return cu_copy_H2D_generic(d_dest, h_src, bytes, false);
 121 }
 122
 123 /*!
 124  *  The copy is launched in stream s or if not specified, in stream 0.
 125  */
 126 int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = 0)
 127 {
 128     return cu_copy_H2D_generic(d_dest, h_src, bytes, true, s);
 129 }
 130
 131 /**** Operation on buffered arrays (arrays with "over-allocation" in gmx wording) *****/
 132
 133 /*!
 134  * If the pointers to the size variables are NULL no resetting happens.
 135  */
 136 void cu_free_buffered(void *d_ptr, int *n, int *nalloc)
 137 {
 138     cudaError_t stat;
 139
 140     if (d_ptr)
 141     {
 142         stat = cudaFree(d_ptr);
 143         CU_RET_ERR(stat, "cudaFree failed");
 144     }
 145
 146     if (n)
 147     {
 148         *n = -1;
 149     }
 150
 151     if (nalloc)
 152     {
 153         *nalloc = -1;
 154     }
 155 }
 156
 157 /*!
 158  *  Reallocation of the memory pointed by d_ptr and copying of the data from
 159  *  the location pointed by h_src host-side pointer is done. Allocation is
 160  *  buffered and therefore freeing is only needed if the previously allocated
 161  *  space is not enough.
 162  *  The H2D copy is launched in stream s and can be done synchronously or
 163  *  asynchronously (the default is the latter).
 164  */
 165 void cu_realloc_buffered(void **d_dest, void *h_src,
 166                          size_t type_size,
 167                          int *curr_size, int *curr_alloc_size,
 168                          int req_size,
 169                          cudaStream_t s,
 170                          bool bAsync = true)
 171 {
 172     cudaError_t stat;
 173
 174     if (d_dest == NULL || req_size < 0)
 175     {
 176         return;
 177     }
 178
 179     /* reallocate only if the data does not fit = allocation size is smaller
 180        than the current requested size */
 181     if (req_size > *curr_alloc_size)
 182     {
 183         /* only free if the array has already been initialized */
 184         if (*curr_alloc_size >= 0)
 185         {
 186             cu_free_buffered(*d_dest, curr_size, curr_alloc_size);
 187         }
 188
 189         *curr_alloc_size = over_alloc_large(req_size);
 190
 191         stat = cudaMalloc(d_dest, *curr_alloc_size * type_size);
 192         CU_RET_ERR(stat, "cudaMalloc failed in cu_free_buffered");
 193     }
 194
 195     /* size could have changed without actual reallocation */
 196     *curr_size = req_size;
 197
 198     /* upload to device */
 199     if (h_src)
 200     {
 201         if (bAsync)
 202         {
 203             cu_copy_H2D_async(*d_dest, h_src, *curr_size * type_size, s);
 204         }
 205         else
 206         {
 207             cu_copy_H2D_sync(*d_dest, h_src,  *curr_size * type_size);
 208         }
 209     }
 210 }
 211
 212 /*! \brief Return whether texture objects are used on this device.
 213  *
 214  * \param[in]   pointer to the GPU device info structure to inspect for texture objects support
 215  * \return      true if texture objects are used on this device
 216  */
 217 static inline bool use_texobj(const gmx_device_info_t *dev_info)
 218 {
 219     assert(!c_disableCudaTextures);
 220     /* Only device CC >= 3.0 (Kepler and later) support texture objects */
 221     return (dev_info->prop.major >= 3);
 222 }
 223
 224 /*! \brief Set up texture object for an array of type T.
 225  *
 226  * Set up texture object for an array of type T and bind it to the device memory
 227  * \p d_ptr points to.
 228  *
 229  * \tparam[in] T        Raw data type
 230  * \param[out] texObj   texture object to initialize
 231  * \param[in]  d_ptr    pointer to device global memory to bind \p texObj to
 232  * \param[in]  sizeInBytes  size of memory area to bind \p texObj to
 233  */
 234 template <typename T>
 235 static void setup1DTexture(cudaTextureObject_t &texObj,
 236                            void                *d_ptr,
 237                            size_t               sizeInBytes)
 238 {
 239     assert(!c_disableCudaTextures);
 240
 241     cudaError_t      stat;
 242     cudaResourceDesc rd;
 243     cudaTextureDesc  td;
 244
 245     memset(&rd, 0, sizeof(rd));
 246     rd.resType                = cudaResourceTypeLinear;
 247     rd.res.linear.devPtr      = d_ptr;
 248     rd.res.linear.desc        = cudaCreateChannelDesc<T>();
 249     rd.res.linear.sizeInBytes = sizeInBytes;
 250
 251     memset(&td, 0, sizeof(td));
 252     td.readMode                 = cudaReadModeElementType;
 253     stat = cudaCreateTextureObject(&texObj, &rd, &td, NULL);
 254     CU_RET_ERR(stat, "cudaCreateTextureObject failed");
 255 }
 256
 257 /*! \brief Set up texture reference for an array of type T.
 258  *
 259  * Set up texture object for an array of type T and bind it to the device memory
 260  * \p d_ptr points to.
 261  *
 262  * \tparam[in] T        Raw data type
 263  * \param[out] texObj   texture reference to initialize
 264  * \param[in]  d_ptr    pointer to device global memory to bind \p texObj to
 265  * \param[in]  sizeInBytes  size of memory area to bind \p texObj to
 266  */
 267 template <typename T>
 268 static void setup1DTexture(const struct texture<T, 1, cudaReadModeElementType> *texRef,
 269                            const void                                          *d_ptr,
 270                            size_t                                              sizeInBytes)
 271 {
 272     assert(!c_disableCudaTextures);
 273
 274     cudaError_t           stat;
 275     cudaChannelFormatDesc cd;
 276
 277     cd   = cudaCreateChannelDesc<T>();
 278     stat = cudaBindTexture(nullptr, texRef, d_ptr, &cd, sizeInBytes);
 279     CU_RET_ERR(stat, "cudaBindTexture failed");
 280 }
 281
 282 template <typename T>
 283 void initParamLookupTable(T                        * &d_ptr,
 284                           cudaTextureObject_t       &texObj,
 285                           const struct texture<T, 1, cudaReadModeElementType> *texRef,
 286                           const T                   *h_ptr,
 287                           int                        numElem,
 288                           const gmx_device_info_t   *devInfo)
 289 {
 290     const size_t sizeInBytes = numElem * sizeof(*d_ptr);
 291     cudaError_t  stat        = cudaMalloc((void **)&d_ptr, sizeInBytes);
 292     CU_RET_ERR(stat, "cudaMalloc failed in initParamLookupTable");
 293     cu_copy_H2D_sync(d_ptr, (void *)h_ptr, sizeInBytes);
 294
 295     if (!c_disableCudaTextures)
 296     {
 297         if (use_texobj(devInfo))
 298         {
 299             setup1DTexture<T>(texObj, d_ptr, sizeInBytes);
 300         }
 301         else
 302         {
 303             setup1DTexture<T>(texRef, d_ptr, sizeInBytes);
 304         }
 305     }
 306 }
 307
 308 template <typename T>
 309 void destroyParamLookupTable(T                       *d_ptr,
 310                              cudaTextureObject_t      texObj,
 311                              const struct texture<T, 1, cudaReadModeElementType> *texRef,
 312                              const gmx_device_info_t *devInfo)
 313 {
 314     if (!c_disableCudaTextures)
 315     {
 316         if (use_texobj(devInfo))
 317         {
 318             CU_RET_ERR(cudaDestroyTextureObject(texObj), "cudaDestroyTextureObject on texObj failed");
 319         }
 320         else
 321         {
 322             CU_RET_ERR(cudaUnbindTexture(texRef), "cudaUnbindTexture on texRef failed");
 323         }
 324     }
 325     CU_RET_ERR(cudaFree(d_ptr), "cudaFree failed");
 326 }
 327
 328 /*! \brief Add explicit instantiations of init/destroyParamLookupTable() here as needed.
 329  * One should also verify that the result of cudaCreateChannelDesc<T>() during texture setup
 330  * looks reasonable, when instantiating the templates for new types - just in case.
 331  */
 332 template void initParamLookupTable<float>(float * &, cudaTextureObject_t &, const texture<float, 1, cudaReadModeElementType> *, const float *, int, const gmx_device_info_t *);
 333 template void destroyParamLookupTable<float>(float *, cudaTextureObject_t, const texture<float, 1, cudaReadModeElementType> *, const gmx_device_info_t *);
 334 template void initParamLookupTable<int>(int * &, cudaTextureObject_t &, const texture<int, 1, cudaReadModeElementType> *, const int *, int, const gmx_device_info_t *);
 335 template void destroyParamLookupTable<int>(int *, cudaTextureObject_t, const texture<int, 1, cudaReadModeElementType> *, const gmx_device_info_t *);