src/gromacs/gpu_utils/cudautils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2014,2015,2016,2017,2018, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "gmxpre.h"
  37
  38 #include "cudautils.cuh"
  39
  40 #include <cassert>
  41 #include <cstdlib>
  42
  43 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
  44 #include "gromacs/gpu_utils/devicebuffer.h" //TODO remove when removing cu_realloc_buffered
  45 #include "gromacs/gpu_utils/gpu_utils.h"
  46 #include "gromacs/utility/gmxassert.h"
  47 #include "gromacs/utility/smalloc.h"
  48
  49 /*** Generic CUDA data operation wrappers ***/
  50
  51 // TODO: template on transferKind to avoid runtime conditionals
  52 int cu_copy_D2H(void *h_dest, void *d_src, size_t bytes,
  53                 GpuApiCallBehavior transferKind, cudaStream_t s = 0)
  54 {
  55     cudaError_t stat;
  56
  57     if (h_dest == NULL || d_src == NULL || bytes == 0)
  58     {
  59         return -1;
  60     }
  61
  62     switch (transferKind)
  63     {
  64         case GpuApiCallBehavior::Async:
  65             GMX_ASSERT(isHostMemoryPinned(h_dest), "Destination buffer was not pinned for CUDA");
  66             stat = cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s);
  67             CU_RET_ERR(stat, "DtoH cudaMemcpyAsync failed");
  68             break;
  69
  70         case GpuApiCallBehavior::Sync:
  71             stat = cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
  72             CU_RET_ERR(stat, "DtoH cudaMemcpy failed");
  73             break;
  74
  75         default:
  76             throw;
  77     }
  78
  79     return 0;
  80 }
  81
  82 int cu_copy_D2H_sync(void * h_dest, void * d_src, size_t bytes)
  83 {
  84     return cu_copy_D2H(h_dest, d_src, bytes, GpuApiCallBehavior::Sync);
  85 }
  86
  87 /*!
  88  *  The copy is launched in stream s or if not specified, in stream 0.
  89  */
  90 int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s = 0)
  91 {
  92     return cu_copy_D2H(h_dest, d_src, bytes, GpuApiCallBehavior::Async, s);
  93 }
  94
  95 // TODO: template on transferKind to avoid runtime conditionals
  96 int cu_copy_H2D(void *d_dest, void *h_src, size_t bytes,
  97                 GpuApiCallBehavior transferKind, cudaStream_t s = 0)
  98 {
  99     cudaError_t stat;
 100
 101     if (d_dest == NULL || h_src == NULL || bytes == 0)
 102     {
 103         return -1;
 104     }
 105
 106     switch (transferKind)
 107     {
 108         case GpuApiCallBehavior::Async:
 109             GMX_ASSERT(isHostMemoryPinned(h_src), "Source buffer was not pinned for CUDA");
 110             stat = cudaMemcpyAsync(d_dest, h_src, bytes, cudaMemcpyHostToDevice, s);
 111             CU_RET_ERR(stat, "HtoD cudaMemcpyAsync failed");
 112             break;
 113
 114         case GpuApiCallBehavior::Sync:
 115             stat = cudaMemcpy(d_dest, h_src, bytes, cudaMemcpyHostToDevice);
 116             CU_RET_ERR(stat, "HtoD cudaMemcpy failed");
 117             break;
 118
 119         default:
 120             throw;
 121     }
 122
 123     return 0;
 124 }
 125
 126 int cu_copy_H2D_sync(void * d_dest, void * h_src, size_t bytes)
 127 {
 128     return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Sync);
 129 }
 130
 131 /*!
 132  *  The copy is launched in stream s or if not specified, in stream 0.
 133  */
 134 int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = 0)
 135 {
 136     return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Async, s);
 137 }
 138
 139 /**** Operation on buffered arrays (arrays with "over-allocation" in gmx wording) *****/
 140
 141 /*!
 142  *  Reallocation of the memory pointed by d_ptr and copying of the data from
 143  *  the location pointed by h_src host-side pointer is done. Allocation is
 144  *  buffered and therefore freeing is only needed if the previously allocated
 145  *  space is not enough.
 146  *  The H2D copy is launched in stream s and can be done synchronously or
 147  *  asynchronously (the default is the latter).
 148  */
 149 void cu_realloc_buffered(void **d_dest, void *h_src,
 150                          size_t type_size,
 151                          int *curr_size, int *curr_alloc_size,
 152                          int req_size,
 153                          cudaStream_t s,
 154                          bool bAsync = true)
 155 {
 156     cudaError_t stat;
 157
 158     if (d_dest == NULL || req_size < 0)
 159     {
 160         return;
 161     }
 162
 163     /* reallocate only if the data does not fit = allocation size is smaller
 164        than the current requested size */
 165     if (req_size > *curr_alloc_size)
 166     {
 167         /* only free if the array has already been initialized */
 168         if (*curr_alloc_size >= 0)
 169         {
 170             freeDeviceBuffer(d_dest);
 171         }
 172
 173         *curr_alloc_size = over_alloc_large(req_size);
 174
 175         stat = cudaMalloc(d_dest, *curr_alloc_size * type_size);
 176         CU_RET_ERR(stat, "cudaMalloc failed in cu_realloc_buffered");
 177     }
 178
 179     /* size could have changed without actual reallocation */
 180     *curr_size = req_size;
 181
 182     /* upload to device */
 183     if (h_src)
 184     {
 185         if (bAsync)
 186         {
 187             cu_copy_H2D_async(*d_dest, h_src, *curr_size * type_size, s);
 188         }
 189         else
 190         {
 191             cu_copy_H2D_sync(*d_dest, h_src,  *curr_size * type_size);
 192         }
 193     }
 194 }
 195
 196 /*! \brief Return whether texture objects are used on this device.
 197  *
 198  * \param[in]   pointer to the GPU device info structure to inspect for texture objects support
 199  * \return      true if texture objects are used on this device
 200  */
 201 static inline bool use_texobj(const gmx_device_info_t *dev_info)
 202 {
 203     assert(!c_disableCudaTextures);
 204     /* Only device CC >= 3.0 (Kepler and later) support texture objects */
 205     return (dev_info->prop.major >= 3);
 206 }
 207
 208 /*! \brief Set up texture object for an array of type T.
 209  *
 210  * Set up texture object for an array of type T and bind it to the device memory
 211  * \p d_ptr points to.
 212  *
 213  * \tparam[in] T        Raw data type
 214  * \param[out] texObj   texture object to initialize
 215  * \param[in]  d_ptr    pointer to device global memory to bind \p texObj to
 216  * \param[in]  sizeInBytes  size of memory area to bind \p texObj to
 217  */
 218 template <typename T>
 219 static void setup1DTexture(cudaTextureObject_t &texObj,
 220                            void                *d_ptr,
 221                            size_t               sizeInBytes)
 222 {
 223     assert(!c_disableCudaTextures);
 224
 225     cudaError_t      stat;
 226     cudaResourceDesc rd;
 227     cudaTextureDesc  td;
 228
 229     memset(&rd, 0, sizeof(rd));
 230     rd.resType                = cudaResourceTypeLinear;
 231     rd.res.linear.devPtr      = d_ptr;
 232     rd.res.linear.desc        = cudaCreateChannelDesc<T>();
 233     rd.res.linear.sizeInBytes = sizeInBytes;
 234
 235     memset(&td, 0, sizeof(td));
 236     td.readMode                 = cudaReadModeElementType;
 237     stat = cudaCreateTextureObject(&texObj, &rd, &td, NULL);
 238     CU_RET_ERR(stat, "cudaCreateTextureObject failed");
 239 }
 240
 241 template <typename T>
 242 void initParamLookupTable(T                        * &d_ptr,
 243                           cudaTextureObject_t        &texObj,
 244                           const T                    *h_ptr,
 245                           int                         numElem,
 246                           const gmx_device_info_t    *devInfo)
 247 {
 248     const size_t sizeInBytes = numElem * sizeof(*d_ptr);
 249     cudaError_t  stat        = cudaMalloc((void **)&d_ptr, sizeInBytes);
 250     CU_RET_ERR(stat, "cudaMalloc failed in initParamLookupTable");
 251     cu_copy_H2D_sync(d_ptr, (void *)h_ptr, sizeInBytes);
 252
 253     if (!c_disableCudaTextures)
 254     {
 255         if (use_texobj(devInfo))
 256         {
 257             setup1DTexture<T>(texObj, d_ptr, sizeInBytes);
 258         }
 259     }
 260 }
 261
 262 template <typename T>
 263 void destroyParamLookupTable(T                       *d_ptr,
 264                              cudaTextureObject_t      texObj,
 265                              const gmx_device_info_t *devInfo)
 266 {
 267     if (!c_disableCudaTextures)
 268     {
 269         if (use_texobj(devInfo))
 270         {
 271             CU_RET_ERR(cudaDestroyTextureObject(texObj), "cudaDestroyTextureObject on texObj failed");
 272         }
 273     }
 274     CU_RET_ERR(cudaFree(d_ptr), "cudaFree failed");
 275 }
 276
 277 /*! \brief Add explicit instantiations of init/destroyParamLookupTable() here as needed.
 278  * One should also verify that the result of cudaCreateChannelDesc<T>() during texture setup
 279  * looks reasonable, when instantiating the templates for new types - just in case.
 280  */
 281 template void initParamLookupTable<float>(float * &, cudaTextureObject_t &, const float *, int, const gmx_device_info_t *);
 282 template void destroyParamLookupTable<float>(float *, cudaTextureObject_t, const gmx_device_info_t *);
 283 template void initParamLookupTable<int>(int * &, cudaTextureObject_t &, const int *, int, const gmx_device_info_t *);
 284 template void destroyParamLookupTable<int>(int *, cudaTextureObject_t, const gmx_device_info_t *);