src/gromacs/gpu_utils/cudautils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2014,2015,2016,2017,2018, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "gmxpre.h"
  37
  38 #include "cudautils.cuh"
  39
  40 #include <cassert>
  41 #include <cstdlib>
  42
  43 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
  44 #include "gromacs/gpu_utils/gpu_utils.h"
  45 #include "gromacs/utility/gmxassert.h"
  46 #include "gromacs/utility/smalloc.h"
  47
  48 /*** Generic CUDA data operation wrappers ***/
  49
  50 // TODO: template on transferKind to avoid runtime conditionals
  51 int cu_copy_D2H(void *h_dest, void *d_src, size_t bytes,
  52                 GpuApiCallBehavior transferKind, cudaStream_t s = 0)
  53 {
  54     cudaError_t stat;
  55
  56     if (h_dest == NULL || d_src == NULL || bytes == 0)
  57     {
  58         return -1;
  59     }
  60
  61     switch (transferKind)
  62     {
  63         case GpuApiCallBehavior::Async:
  64             GMX_ASSERT(isHostMemoryPinned(h_dest), "Destination buffer was not pinned for CUDA");
  65             stat = cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s);
  66             CU_RET_ERR(stat, "DtoH cudaMemcpyAsync failed");
  67             break;
  68
  69         case GpuApiCallBehavior::Sync:
  70             stat = cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
  71             CU_RET_ERR(stat, "DtoH cudaMemcpy failed");
  72             break;
  73
  74         default:
  75             throw;
  76     }
  77
  78     return 0;
  79 }
  80
  81 int cu_copy_D2H_sync(void * h_dest, void * d_src, size_t bytes)
  82 {
  83     return cu_copy_D2H(h_dest, d_src, bytes, GpuApiCallBehavior::Sync);
  84 }
  85
  86 /*!
  87  *  The copy is launched in stream s or if not specified, in stream 0.
  88  */
  89 int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s = 0)
  90 {
  91     return cu_copy_D2H(h_dest, d_src, bytes, GpuApiCallBehavior::Async, s);
  92 }
  93
  94 // TODO: template on transferKind to avoid runtime conditionals
  95 int cu_copy_H2D(void *d_dest, void *h_src, size_t bytes,
  96                 GpuApiCallBehavior transferKind, cudaStream_t s = 0)
  97 {
  98     cudaError_t stat;
  99
 100     if (d_dest == NULL || h_src == NULL || bytes == 0)
 101     {
 102         return -1;
 103     }
 104
 105     switch (transferKind)
 106     {
 107         case GpuApiCallBehavior::Async:
 108             GMX_ASSERT(isHostMemoryPinned(h_src), "Source buffer was not pinned for CUDA");
 109             stat = cudaMemcpyAsync(d_dest, h_src, bytes, cudaMemcpyHostToDevice, s);
 110             CU_RET_ERR(stat, "HtoD cudaMemcpyAsync failed");
 111             break;
 112
 113         case GpuApiCallBehavior::Sync:
 114             stat = cudaMemcpy(d_dest, h_src, bytes, cudaMemcpyHostToDevice);
 115             CU_RET_ERR(stat, "HtoD cudaMemcpy failed");
 116             break;
 117
 118         default:
 119             throw;
 120     }
 121
 122     return 0;
 123 }
 124
 125 int cu_copy_H2D_sync(void * d_dest, void * h_src, size_t bytes)
 126 {
 127     return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Sync);
 128 }
 129
 130 /*!
 131  *  The copy is launched in stream s or if not specified, in stream 0.
 132  */
 133 int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = 0)
 134 {
 135     return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Async, s);
 136 }
 137
 138 /**** Operation on buffered arrays (arrays with "over-allocation" in gmx wording) *****/
 139
 140 /*!
 141  * If the pointers to the size variables are NULL no resetting happens.
 142  */
 143 void cu_free_buffered(void *d_ptr, int *n, int *nalloc)
 144 {
 145     cudaError_t stat;
 146
 147     if (d_ptr)
 148     {
 149         stat = cudaFree(d_ptr);
 150         CU_RET_ERR(stat, "cudaFree failed");
 151     }
 152
 153     if (n)
 154     {
 155         *n = -1;
 156     }
 157
 158     if (nalloc)
 159     {
 160         *nalloc = -1;
 161     }
 162 }
 163
 164 /*!
 165  *  Reallocation of the memory pointed by d_ptr and copying of the data from
 166  *  the location pointed by h_src host-side pointer is done. Allocation is
 167  *  buffered and therefore freeing is only needed if the previously allocated
 168  *  space is not enough.
 169  *  The H2D copy is launched in stream s and can be done synchronously or
 170  *  asynchronously (the default is the latter).
 171  */
 172 void cu_realloc_buffered(void **d_dest, void *h_src,
 173                          size_t type_size,
 174                          int *curr_size, int *curr_alloc_size,
 175                          int req_size,
 176                          cudaStream_t s,
 177                          bool bAsync = true)
 178 {
 179     cudaError_t stat;
 180
 181     if (d_dest == NULL || req_size < 0)
 182     {
 183         return;
 184     }
 185
 186     /* reallocate only if the data does not fit = allocation size is smaller
 187        than the current requested size */
 188     if (req_size > *curr_alloc_size)
 189     {
 190         /* only free if the array has already been initialized */
 191         if (*curr_alloc_size >= 0)
 192         {
 193             cu_free_buffered(*d_dest, curr_size, curr_alloc_size);
 194         }
 195
 196         *curr_alloc_size = over_alloc_large(req_size);
 197
 198         stat = cudaMalloc(d_dest, *curr_alloc_size * type_size);
 199         CU_RET_ERR(stat, "cudaMalloc failed in cu_free_buffered");
 200     }
 201
 202     /* size could have changed without actual reallocation */
 203     *curr_size = req_size;
 204
 205     /* upload to device */
 206     if (h_src)
 207     {
 208         if (bAsync)
 209         {
 210             cu_copy_H2D_async(*d_dest, h_src, *curr_size * type_size, s);
 211         }
 212         else
 213         {
 214             cu_copy_H2D_sync(*d_dest, h_src,  *curr_size * type_size);
 215         }
 216     }
 217 }
 218
 219 /*! \brief Return whether texture objects are used on this device.
 220  *
 221  * \param[in]   pointer to the GPU device info structure to inspect for texture objects support
 222  * \return      true if texture objects are used on this device
 223  */
 224 static inline bool use_texobj(const gmx_device_info_t *dev_info)
 225 {
 226     assert(!c_disableCudaTextures);
 227     /* Only device CC >= 3.0 (Kepler and later) support texture objects */
 228     return (dev_info->prop.major >= 3);
 229 }
 230
 231 /*! \brief Set up texture object for an array of type T.
 232  *
 233  * Set up texture object for an array of type T and bind it to the device memory
 234  * \p d_ptr points to.
 235  *
 236  * \tparam[in] T        Raw data type
 237  * \param[out] texObj   texture object to initialize
 238  * \param[in]  d_ptr    pointer to device global memory to bind \p texObj to
 239  * \param[in]  sizeInBytes  size of memory area to bind \p texObj to
 240  */
 241 template <typename T>
 242 static void setup1DTexture(cudaTextureObject_t &texObj,
 243                            void                *d_ptr,
 244                            size_t               sizeInBytes)
 245 {
 246     assert(!c_disableCudaTextures);
 247
 248     cudaError_t      stat;
 249     cudaResourceDesc rd;
 250     cudaTextureDesc  td;
 251
 252     memset(&rd, 0, sizeof(rd));
 253     rd.resType                = cudaResourceTypeLinear;
 254     rd.res.linear.devPtr      = d_ptr;
 255     rd.res.linear.desc        = cudaCreateChannelDesc<T>();
 256     rd.res.linear.sizeInBytes = sizeInBytes;
 257
 258     memset(&td, 0, sizeof(td));
 259     td.readMode                 = cudaReadModeElementType;
 260     stat = cudaCreateTextureObject(&texObj, &rd, &td, NULL);
 261     CU_RET_ERR(stat, "cudaCreateTextureObject failed");
 262 }
 263
 264 template <typename T>
 265 void initParamLookupTable(T                        * &d_ptr,
 266                           cudaTextureObject_t        &texObj,
 267                           const T                    *h_ptr,
 268                           int                         numElem,
 269                           const gmx_device_info_t    *devInfo)
 270 {
 271     const size_t sizeInBytes = numElem * sizeof(*d_ptr);
 272     cudaError_t  stat        = cudaMalloc((void **)&d_ptr, sizeInBytes);
 273     CU_RET_ERR(stat, "cudaMalloc failed in initParamLookupTable");
 274     cu_copy_H2D_sync(d_ptr, (void *)h_ptr, sizeInBytes);
 275
 276     if (!c_disableCudaTextures)
 277     {
 278         if (use_texobj(devInfo))
 279         {
 280             setup1DTexture<T>(texObj, d_ptr, sizeInBytes);
 281         }
 282     }
 283 }
 284
 285 template <typename T>
 286 void destroyParamLookupTable(T                       *d_ptr,
 287                              cudaTextureObject_t      texObj,
 288                              const gmx_device_info_t *devInfo)
 289 {
 290     if (!c_disableCudaTextures)
 291     {
 292         if (use_texobj(devInfo))
 293         {
 294             CU_RET_ERR(cudaDestroyTextureObject(texObj), "cudaDestroyTextureObject on texObj failed");
 295         }
 296     }
 297     CU_RET_ERR(cudaFree(d_ptr), "cudaFree failed");
 298 }
 299
 300 /*! \brief Add explicit instantiations of init/destroyParamLookupTable() here as needed.
 301  * One should also verify that the result of cudaCreateChannelDesc<T>() during texture setup
 302  * looks reasonable, when instantiating the templates for new types - just in case.
 303  */
 304 template void initParamLookupTable<float>(float * &, cudaTextureObject_t &, const float *, int, const gmx_device_info_t *);
 305 template void destroyParamLookupTable<float>(float *, cudaTextureObject_t, const gmx_device_info_t *);
 306 template void initParamLookupTable<int>(int * &, cudaTextureObject_t &, const int *, int, const gmx_device_info_t *);
 307 template void destroyParamLookupTable<int>(int *, cudaTextureObject_t, const gmx_device_info_t *);