src/gromacs/gpu_utils/oclutils.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \libinternal \file
  36  *  \brief Declare utility routines for OpenCL
  37  *
  38  *  \author Anca Hamuraru <anca@streamcomputing.eu>
  39  *  \inlibraryapi
  40  */
  41 #ifndef GMX_GPU_UTILS_OCLUTILS_H
  42 #define GMX_GPU_UTILS_OCLUTILS_H
  43
  44 #include <string>
  45
  46 #include "gromacs/gpu_utils/gmxopencl.h"
  47 #include "gromacs/gpu_utils/gputraits_ocl.h"
  48 #include "gromacs/utility/exceptions.h"
  49 #include "gromacs/utility/gmxassert.h"
  50
//! Forward declaration only: this header uses the type by value for the \c transferKind parameter of ocl_copy_D2H() and ocl_copy_H2D(); its enumerators are not defined here.
enum class GpuApiCallBehavior;
  52
  53 /*! \brief OpenCL vendor IDs */
  54 typedef enum {
  55     OCL_VENDOR_NVIDIA = 0,
  56     OCL_VENDOR_AMD,
  57     OCL_VENDOR_INTEL,
  58     OCL_VENDOR_UNKNOWN
  59 } ocl_vendor_id_t;
  60
  61 /*! \internal
  62  * \brief OpenCL GPU device identificator
  63  *
  64  * An OpenCL device is identified by its ID.
  65  * The platform ID is also included for caching reasons.
  66  */
  67 typedef struct
  68 {
  69     cl_platform_id      ocl_platform_id; /**< Platform ID */
  70     cl_device_id        ocl_device_id;   /**< Device ID */
  71 } ocl_gpu_id_t;
  72
  73 /*! \internal
  74  * \brief OpenCL device information.
  75  *
  76  * The OpenCL device information is queried and set at detection and contains
  77  * both information about the device/hardware returned by the runtime as well
  78  * as additional data like support status.
  79  */
  80 struct gmx_device_info_t
  81 {
  82     ocl_gpu_id_t        ocl_gpu_id;          /**< device ID assigned at detection   */
  83     char                device_name[256];    /**< device name */
  84     char                device_version[256]; /**< device version */
  85     char                device_vendor[256];  /**< device vendor */
  86     int                 compute_units;       /**< number of compute units */
  87     int                 adress_bits;         /**< number of adress bits the device is capable of */
  88     int                 stat;                /**< device status takes values of e_gpu_detect_res_t */
  89     ocl_vendor_id_t     vendor_e;            /**< device vendor as defined by ocl_vendor_id_t */
  90     size_t              maxWorkItemSizes[3]; /**< workgroup size limits (CL_DEVICE_MAX_WORK_ITEM_SIZES) */
  91     size_t              maxWorkGroupSize;    /**< workgroup total size limit (CL_DEVICE_MAX_WORK_GROUP_SIZE) */
  92 };
  93
  94 /*! \internal
  95  * \brief OpenCL GPU runtime data
  96  *
  97  * The device runtime data is meant to hold objects associated with a GROMACS rank's
  98  * (thread or process) use of a single device (multiple devices per rank is not
  99  * implemented). These objects should be constructed at ther point where a device
 100  * dets assigned to a rank and released at when this assignment is no longer valid
 101  * (i.e. at cleanup in the current implementation).
 102  *
 103  */
 104 struct gmx_device_runtime_data_t
 105 {
 106     cl_context context; /**< OpenCL context */
 107     cl_program program; /**< OpenCL program */
 108 };
 109
 110 /*! \brief Launches synchronous or asynchronous device to host memory copy.
 111  *
 112  *  If copy_event is not NULL, on return it will contain an event object
 113  *  identifying this particular device to host operation. The event can further
 114  *  be used to queue a wait for this operation or to query profiling information.
 115  */
 116 int ocl_copy_D2H(void * h_dest, cl_mem d_src,
 117                  size_t offset, size_t bytes,
 118                  GpuApiCallBehavior transferKind,
 119                  cl_command_queue command_queue,
 120                  cl_event *copy_event);
 121
 122
 123 /*! \brief Launches asynchronous device to host memory copy. */
 124 int ocl_copy_D2H_async(void * h_dest, cl_mem d_src,
 125                        size_t offset, size_t bytes,
 126                        cl_command_queue command_queue,
 127                        cl_event *copy_event);
 128
 129 /*! \brief Launches synchronous or asynchronous host to device memory copy.
 130  *
 131  *  If copy_event is not NULL, on return it will contain an event object
 132  *  identifying this particular host to device operation. The event can further
 133  *  be used to queue a wait for this operation or to query profiling information.
 134  */
 135 int ocl_copy_H2D(cl_mem d_dest, const void* h_src,
 136                  size_t offset, size_t bytes,
 137                  GpuApiCallBehavior transferKind,
 138                  cl_command_queue command_queue,
 139                  cl_event *copy_event);
 140
 141 /*! \brief Launches asynchronous host to device memory copy. */
 142 int ocl_copy_H2D_async(cl_mem d_dest, const void * h_src,
 143                        size_t offset, size_t bytes,
 144                        cl_command_queue command_queue,
 145                        cl_event *copy_event);
 146
 147 /*! \brief Launches synchronous host to device memory copy. */
 148 int ocl_copy_H2D_sync(cl_mem d_dest, const void * h_src,
 149                       size_t offset, size_t bytes,
 150                       cl_command_queue command_queue);
 151
 152 /*! \brief Allocate host memory in malloc style */
 153 void pmalloc(void **h_ptr, size_t nbytes);
 154
 155 /*! \brief Free host memory in malloc style */
 156 void pfree(void *h_ptr);
 157
 158 /*! \brief Convert error code to diagnostic string */
 159 std::string ocl_get_error_string(cl_int error);
 160
 161 /*! \brief Calls clFinish() in the stream \p s.
 162  *
 163  * \param[in] s stream to synchronize with
 164  */
 165 static inline void gpuStreamSynchronize(cl_command_queue s)
 166 {
 167     cl_int cl_error = clFinish(s);
 168     GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error,
 169                        ("Error caught during clFinish:" + ocl_get_error_string(cl_error)).c_str());
 170 }
 171
 172 //! A debug checker to track cl_events being released correctly
 173 inline void ensureReferenceCount(const cl_event &event, unsigned int refCount)
 174 {
 175 #ifndef NDEBUG
 176     cl_int clError = clGetEventInfo(event, CL_EVENT_REFERENCE_COUNT, sizeof(refCount), &refCount, nullptr);
 177     GMX_ASSERT(CL_SUCCESS == clError, ocl_get_error_string(clError).c_str());
 178     GMX_ASSERT(refCount == refCount, "Unexpected reference count");
 179 #else
 180     GMX_UNUSED_VALUE(event);
 181     GMX_UNUSED_VALUE(refCount);
 182 #endif
 183 }
 184
/*! \brief Placeholder for a stream completion query; not implemented for OpenCL (dummy implementation).
 *
 * The condition passed to GMX_RELEASE_ASSERT is the constant false, so every
 * call triggers the release assertion. The return statement that follows
 * always yields false.
 *
 * \param[in] s queue to check (marked unused)
 *
 *  \returns     Intended contract: true if all tasks enqueued in the stream \p s (at the time of this call) have completed.
 *               The current code never returns true.
 */
static inline bool haveStreamTasksCompleted(cl_command_queue gmx_unused s)
{
    GMX_RELEASE_ASSERT(false, "haveStreamTasksCompleted is not implemented for OpenCL");
    return false;
}
 196
 197 /* Kernel launch helpers */
 198
 199 /*! \brief
 200  * A function for setting up a single OpenCL kernel argument.
 201  * This is the tail of the compile-time recursive function below.
 202  * It has to be seen by the compiler first.
 203  * As NB kernels might be using dynamic local memory as the last argument,
 204  * this function also manages that, using sharedMemorySize from \p config.
 205  *
 206  * \param[in]     kernel          Kernel function handle
 207  * \param[in]     config          Kernel configuration for launching
 208  * \param[in]     argIndex        Index of the current argument
 209  */
 210 void inline prepareGpuKernelArgument(cl_kernel                 kernel,
 211                                      const KernelLaunchConfig &config,
 212                                      size_t                    argIndex)
 213 {
 214     if (config.sharedMemorySize > 0)
 215     {
 216         cl_int gmx_used_in_debug clError = clSetKernelArg(kernel, argIndex, config.sharedMemorySize, nullptr);
 217         GMX_ASSERT(CL_SUCCESS == clError, ocl_get_error_string(clError).c_str());
 218     }
 219 }
 220
 221 /*! \brief
 222  * Compile-time recursive function for setting up a single OpenCL kernel argument.
 223  * This function uses one kernel argument pointer \p argPtr to call clSetKernelArg(),
 224  * and calls itself on the next argument, eventually calling the tail function above.
 225  *
 226  * \tparam        CurrentArg      Type of the current argument
 227  * \tparam        RemainingArgs   Types of remaining arguments after the current one
 228  * \param[in]     kernel          Kernel function handle
 229  * \param[in]     config          Kernel configuration for launching
 230  * \param[in]     argIndex        Index of the current argument
 231  * \param[in]     argPtr          Pointer to the current argument
 232  * \param[in]     otherArgsPtrs   Pack of pointers to arguments remaining to process after the current one
 233  */
 234 template <typename CurrentArg, typename ... RemainingArgs>
 235 void prepareGpuKernelArgument(cl_kernel                 kernel,
 236                               const KernelLaunchConfig &config,
 237                               size_t                    argIndex,
 238                               const CurrentArg         *argPtr,
 239                               const RemainingArgs *...  otherArgsPtrs)
 240 {
 241     cl_int gmx_used_in_debug clError = clSetKernelArg(kernel, argIndex, sizeof(CurrentArg), argPtr);
 242     GMX_ASSERT(CL_SUCCESS == clError, ocl_get_error_string(clError).c_str());
 243
 244     prepareGpuKernelArgument(kernel, config, argIndex + 1, otherArgsPtrs ...);
 245 }
 246
 247 /*! \brief
 248  * A wrapper function for setting up all the OpenCL kernel arguments.
 249  * Calls the recursive functions above.
 250  *
 251  * \tparam    Args            Types of all the kernel arguments
 252  * \param[in] kernel          Kernel function handle
 253  * \param[in] config          Kernel configuration for launching
 254  * \param[in] argsPtrs        Pointers to all the kernel arguments
 255  * \returns A handle for the prepared parameter pack to be used with launchGpuKernel() as the last argument
 256  * - currently always nullptr for OpenCL, as it manages kernel/arguments association by itself.
 257  */
 258 template <typename ... Args>
 259 void *prepareGpuKernelArguments(cl_kernel                 kernel,
 260                                 const KernelLaunchConfig &config,
 261                                 const Args *...           argsPtrs)
 262 {
 263     prepareGpuKernelArgument(kernel, config, 0, argsPtrs ...);
 264     return nullptr;
 265 }
 266
 267 /*! \brief Launches the OpenCL kernel and handles the errors.
 268  *
 269  * \param[in] kernel          Kernel function handle
 270  * \param[in] config          Kernel configuration for launching
 271  * \param[in] timingEvent     Timing event, fetched from GpuRegionTimer
 272  * \param[in] kernelName      Human readable kernel description, for error handling only
 273  * \throws gmx::InternalError on kernel launch failure
 274  */
 275 inline void launchGpuKernel(cl_kernel                 kernel,
 276                             const KernelLaunchConfig &config,
 277                             CommandEvent             *timingEvent,
 278                             const char               *kernelName,
 279                             const void                * /*kernelArgs*/)
 280 {
 281     const int       workDimensions    = 3;
 282     const size_t   *globalWorkOffset  = nullptr;
 283     const size_t    waitListSize      = 0;
 284     const cl_event *waitList          = nullptr;
 285     size_t          globalWorkSize[3];
 286     for (int i = 0; i < workDimensions; i++)
 287     {
 288         globalWorkSize[i] = config.gridSize[i] * config.blockSize[i];
 289     }
 290     cl_int clError = clEnqueueNDRangeKernel(config.stream, kernel, workDimensions, globalWorkOffset,
 291                                             globalWorkSize, config.blockSize, waitListSize, waitList, timingEvent);
 292     if (CL_SUCCESS != clError)
 293     {
 294         const std::string errorMessage = "GPU kernel (" +  std::string(kernelName) +
 295             ") failed to launch: " + ocl_get_error_string(clError);
 296         GMX_THROW(gmx::InternalError(errorMessage));
 297     }
 298 }
 299
 300 #endif