src/gromacs/gpu_utils/gpu_utils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \file
  36  *  \brief Define functions for detection and initialization for CUDA devices.
  37  *
  38  *  \author Szilard Pall <pall.szilard@gmail.com>
  39  */
  40
  41 #include "gmxpre.h"
  42
  43 #include "gpu_utils.h"
  44
  45 #include <assert.h>
  46 #include <stdio.h>
  47 #include <stdlib.h>
  48
  49 #include <cuda_profiler_api.h>
  50
  51 #include "gromacs/gpu_utils/cudautils.cuh"
  52 #include "gromacs/gpu_utils/pmalloc_cuda.h"
  53 #include "gromacs/hardware/gpu_hw_info.h"
  54 #include "gromacs/utility/basedefinitions.h"
  55 #include "gromacs/utility/cstringutil.h"
  56 #include "gromacs/utility/exceptions.h"
  57 #include "gromacs/utility/fatalerror.h"
  58 #include "gromacs/utility/gmxassert.h"
  59 #include "gromacs/utility/programcontext.h"
  60 #include "gromacs/utility/smalloc.h"
  61 #include "gromacs/utility/snprintf.h"
  62 #include "gromacs/utility/stringutil.h"
  63
  64 /*! \internal \brief
  65  * Max number of devices supported by CUDA (for consistency checking).
  66  *
  67  * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
  68  */
  69 static int  cuda_max_device_count = 32;
  70
  71 static bool cudaProfilerRun      = ((getenv("NVPROF_ID") != nullptr));
  72
  73 /** Dummy kernel used for sanity checking. */
  74 static __global__ void k_dummy_test(void)
  75 {
  76 }
  77
  78 static void checkCompiledTargetCompatibility(const gmx_device_info_t *devInfo)
  79 {
  80     assert(devInfo);
  81
  82     cudaFuncAttributes attributes;
  83     cudaError_t        stat = cudaFuncGetAttributes(&attributes, k_dummy_test);
  84
  85     if (cudaErrorInvalidDeviceFunction == stat)
  86     {
  87         gmx_fatal(FARGS,
  88                   "The %s binary does not include support for the CUDA architecture "
  89                   "of the selected GPU (device ID #%d, compute capability %d.%d). "
  90                   "By default, GROMACS supports all architectures of compute "
  91                   "capability >= 3.0, so your GPU "
  92                   "might be rare, or some architectures were disabled in the build. "
  93                   "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
  94                   "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture.",
  95                   gmx::getProgramContext().displayName(), devInfo->id,
  96                   devInfo->prop.major, devInfo->prop.minor);
  97     }
  98
  99     CU_RET_ERR(stat, "cudaFuncGetAttributes failed");
 100 }
 101
 102 bool isHostMemoryPinned(const void *h_ptr)
 103 {
 104     cudaPointerAttributes memoryAttributes;
 105     cudaError_t           stat = cudaPointerGetAttributes(&memoryAttributes, h_ptr);
 106
 107     bool                  result = false;
 108     switch (stat)
 109     {
 110         case cudaSuccess:
 111             result = true;
 112             break;
 113
 114         case cudaErrorInvalidValue:
 115             // If the buffer was not pinned, then it will not be recognized by CUDA at all
 116             result = false;
 117             // Reset the last error status
 118             cudaGetLastError();
 119             break;
 120
 121         default:
 122             CU_RET_ERR(stat, "Unexpected CUDA error");
 123     }
 124     return result;
 125 }
 126
 127 /*!
 128  * \brief Runs GPU sanity checks.
 129  *
 130  * Runs a series of checks to determine that the given GPU and underlying CUDA
 131  * driver/runtime functions properly.
 132  * Returns properties of a device with given ID or the one that has
 133  * already been initialized earlier in the case if of \dev_id == -1.
 134  *
 135  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
 136  * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
 137  * \returns                0 if the device looks OK
 138  *
 139  * TODO: introduce errors codes and handle errors more smoothly.
 140  */
 141 static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
 142 {
 143     cudaError_t cu_err;
 144     int         dev_count, id;
 145
 146     cu_err = cudaGetDeviceCount(&dev_count);
 147     if (cu_err != cudaSuccess)
 148     {
 149         fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
 150                 cudaGetErrorString(cu_err));
 151         return -1;
 152     }
 153
 154     /* no CUDA compatible device at all */
 155     if (dev_count == 0)
 156     {
 157         return -1;
 158     }
 159
 160     /* things might go horribly wrong if cudart is not compatible with the driver */
 161     if (dev_count < 0 || dev_count > cuda_max_device_count)
 162     {
 163         return -1;
 164     }
 165
 166     if (dev_id == -1) /* device already selected let's not destroy the context */
 167     {
 168         cu_err = cudaGetDevice(&id);
 169         if (cu_err != cudaSuccess)
 170         {
 171             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
 172                     cudaGetErrorString(cu_err));
 173             return -1;
 174         }
 175     }
 176     else
 177     {
 178         id = dev_id;
 179         if (id > dev_count - 1) /* pfff there's no such device */
 180         {
 181             fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
 182                     dev_id, dev_count);
 183             return -1;
 184         }
 185     }
 186
 187     memset(dev_prop, 0, sizeof(cudaDeviceProp));
 188     cu_err = cudaGetDeviceProperties(dev_prop, id);
 189     if (cu_err != cudaSuccess)
 190     {
 191         fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
 192                 cudaGetErrorString(cu_err));
 193         return -1;
 194     }
 195
 196     /* both major & minor is 9999 if no CUDA capable devices are present */
 197     if (dev_prop->major == 9999 && dev_prop->minor == 9999)
 198     {
 199         return -1;
 200     }
 201     /* we don't care about emulation mode */
 202     if (dev_prop->major == 0)
 203     {
 204         return -1;
 205     }
 206
 207     if (id != -1)
 208     {
 209         cu_err = cudaSetDevice(id);
 210         if (cu_err != cudaSuccess)
 211         {
 212             fprintf(stderr, "Error %d while switching to device #%d: %s\n",
 213                     cu_err, id, cudaGetErrorString(cu_err));
 214             return -1;
 215         }
 216     }
 217
 218     /* try to execute a dummy kernel */
 219     KernelLaunchConfig config;
 220     config.blockSize[0] = 512;
 221     const auto         dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
 222     launchGpuKernel(k_dummy_test, config, nullptr, "Dummy kernel", dummyArguments);
 223     if (cudaDeviceSynchronize() != cudaSuccess)
 224     {
 225         return -1;
 226     }
 227
 228     /* destroy context if we created one */
 229     if (id != -1)
 230     {
 231         cu_err = cudaDeviceReset();
 232         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 233     }
 234
 235     return 0;
 236 }
 237
 238 void init_gpu(const gmx_device_info_t *deviceInfo)
 239 {
 240     cudaError_t stat;
 241
 242     assert(deviceInfo);
 243
 244     stat = cudaSetDevice(deviceInfo->id);
 245     if (stat != cudaSuccess)
 246     {
 247         auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id);
 248         CU_RET_ERR(stat, message.c_str());
 249     }
 250
 251     if (debug)
 252     {
 253         fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceInfo->id, deviceInfo->prop.name);
 254     }
 255
 256     checkCompiledTargetCompatibility(deviceInfo);
 257 }
 258
 259 void free_gpu(const gmx_device_info_t *deviceInfo)
 260 {
 261     // One should only attempt to clear the device context when
 262     // it has been used, but currently the only way to know that a GPU
 263     // device was used is that deviceInfo will be non-null.
 264     if (deviceInfo == nullptr)
 265     {
 266         return;
 267     }
 268
 269     cudaError_t  stat;
 270
 271     if (debug)
 272     {
 273         int gpuid;
 274         stat = cudaGetDevice(&gpuid);
 275         CU_RET_ERR(stat, "cudaGetDevice failed");
 276         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 277     }
 278
 279     stat = cudaDeviceReset();
 280     if (stat != cudaSuccess)
 281     {
 282         gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat));
 283     }
 284 }
 285
 286 gmx_device_info_t *getDeviceInfo(const gmx_gpu_info_t &gpu_info,
 287                                  int                   deviceId)
 288 {
 289     if (deviceId < 0 || deviceId >= gpu_info.n_dev)
 290     {
 291         gmx_incons("Invalid GPU deviceId requested");
 292     }
 293     return &gpu_info.gpu_dev[deviceId];
 294 }
 295
 296 /*! \brief Returns true if the gpu characterized by the device properties is
 297  *  supported by the native gpu acceleration.
 298  *
 299  * \param[in] dev_prop  the CUDA device properties of the gpus to test.
 300  * \returns             true if the GPU properties passed indicate a compatible
 301  *                      GPU, otherwise false.
 302  */
 303 static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
 304 {
 305     return (dev_prop->major >= 3);
 306 }
 307
 308 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 309  *
 310  *  Returns a status value which indicates compatibility or one of the following
 311  *  errors: incompatibility, insistence, or insanity (=unexpected behavior).
 312  *  It also returns the respective device's properties in \dev_prop (if applicable).
 313  *
 314  *  As the error handling only permits returning the state of the GPU, this function
 315  *  does not clear the CUDA runtime API status allowing the caller to inspect the error
 316  *  upon return. Note that this also means it is the caller's responsibility to
 317  *  reset the CUDA runtime state.
 318  *
 319  *  \param[in]  dev_id   the ID of the GPU to check.
 320  *  \param[out] dev_prop the CUDA device properties of the device checked.
 321  *  \returns             the status of the requested device
 322  */
 323 static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
 324 {
 325     cudaError_t stat;
 326     int         ndev;
 327
 328     stat = cudaGetDeviceCount(&ndev);
 329     if (stat != cudaSuccess)
 330     {
 331         return egpuInsane;
 332     }
 333
 334     if (dev_id > ndev - 1)
 335     {
 336         return egpuNonexistent;
 337     }
 338
 339     /* TODO: currently we do not make a distinction between the type of errors
 340      * that can appear during sanity checks. This needs to be improved, e.g if
 341      * the dummy test kernel fails to execute with a "device busy message" we
 342      * should appropriately report that the device is busy instead of insane.
 343      */
 344     if (do_sanity_checks(dev_id, dev_prop) == 0)
 345     {
 346         if (is_gmx_supported_gpu(dev_prop))
 347         {
 348             return egpuCompatible;
 349         }
 350         else
 351         {
 352             return egpuIncompatible;
 353         }
 354     }
 355     else
 356     {
 357         return egpuInsane;
 358     }
 359 }
 360
 361 bool canDetectGpus(std::string *errorMessage)
 362 {
 363     cudaError_t        stat;
 364     int                driverVersion = -1;
 365     stat = cudaDriverGetVersion(&driverVersion);
 366     GMX_ASSERT(stat != cudaErrorInvalidValue, "An impossible null pointer was passed to cudaDriverGetVersion");
 367     GMX_RELEASE_ASSERT(stat == cudaSuccess,
 368                        gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
 369                                          cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str());
 370     bool foundDriver = (driverVersion > 0);
 371     if (!foundDriver)
 372     {
 373         // Can't detect GPUs if there is no driver
 374         if (errorMessage != nullptr)
 375         {
 376             errorMessage->assign("No valid CUDA driver found");
 377         }
 378         return false;
 379     }
 380
 381     int numDevices;
 382     stat = cudaGetDeviceCount(&numDevices);
 383     if (stat != cudaSuccess)
 384     {
 385         if (errorMessage != nullptr)
 386         {
 387             /* cudaGetDeviceCount failed which means that there is
 388              * something wrong with the machine: driver-runtime
 389              * mismatch, all GPUs being busy in exclusive mode,
 390              * invalid CUDA_VISIBLE_DEVICES, or some other condition
 391              * which should result in GROMACS issuing at least a
 392              * warning. */
 393             errorMessage->assign(cudaGetErrorString(stat));
 394         }
 395
 396         // Consume the error now that we have prepared to handle
 397         // it. This stops it reappearing next time we check for
 398         // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
 399         // valid devices, then cudaGetLastError returns the
 400         // (undocumented) cudaErrorNoDevice, but this should not be a
 401         // problem as there should be no future CUDA API calls.
 402         // NVIDIA bug report #2038718 has been filed.
 403         cudaGetLastError();
 404         // Can't detect GPUs
 405         return false;
 406     }
 407
 408     // We don't actually use numDevices here, that's not the job of
 409     // this function.
 410     return true;
 411 }
 412
 413 void findGpus(gmx_gpu_info_t *gpu_info)
 414 {
 415     assert(gpu_info);
 416
 417     gpu_info->n_dev_compatible = 0;
 418
 419     int         ndev;
 420     cudaError_t stat = cudaGetDeviceCount(&ndev);
 421     if (stat != cudaSuccess)
 422     {
 423         GMX_THROW(gmx::InternalError("Invalid call of findGpus() when CUDA API returned an error, perhaps "
 424                                      "canDetectGpus() was not called appropriately beforehand."));
 425     }
 426
 427     // We expect to start device support/sanity checks with a clean runtime error state
 428     gmx::ensureNoPendingCudaError("");
 429
 430     gmx_device_info_t *devs;
 431     snew(devs, ndev);
 432     for (int i = 0; i < ndev; i++)
 433     {
 434         cudaDeviceProp prop;
 435         int            checkres = is_gmx_supported_gpu_id(i, &prop);
 436
 437         devs[i].id   = i;
 438         devs[i].prop = prop;
 439         devs[i].stat = checkres;
 440
 441         if (checkres == egpuCompatible)
 442         {
 443             gpu_info->n_dev_compatible++;
 444         }
 445         else
 446         {
 447             // TODO:
 448             //  - we inspect the CUDA API state to retrieve and record any
 449             //    errors that occurred during is_gmx_supported_gpu_id() here,
 450             //    but this would be more elegant done within is_gmx_supported_gpu_id()
 451             //    and only return a string with the error if one was encountered.
 452             //  - we'll be reporting without rank information which is not ideal.
 453             //  - we'll end up warning also in cases where users would already
 454             //    get an error before mdrun aborts.
 455             //
 456             // Here we also clear the CUDA API error state so potential
 457             // errors during sanity checks don't propagate.
 458             if ((stat = cudaGetLastError()) != cudaSuccess)
 459             {
 460                 gmx_warning("An error occurred while sanity checking device #%d; %s: %s",
 461                             devs[i].id, cudaGetErrorName(stat), cudaGetErrorString(stat));
 462             }
 463         }
 464     }
 465
 466     stat = cudaPeekAtLastError();
 467     GMX_RELEASE_ASSERT(stat == cudaSuccess,
 468                        gmx::formatString("We promise to return with clean CUDA state, but non-success state encountered: %s: %s",
 469                                          cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str());
 470
 471     gpu_info->n_dev   = ndev;
 472     gpu_info->gpu_dev = devs;
 473 }
 474
 475 void get_gpu_device_info_string(char *s, const gmx_gpu_info_t &gpu_info, int index)
 476 {
 477     assert(s);
 478
 479     if (index < 0 && index >= gpu_info.n_dev)
 480     {
 481         return;
 482     }
 483
 484     gmx_device_info_t *dinfo = &gpu_info.gpu_dev[index];
 485
 486     bool               bGpuExists = (dinfo->stat != egpuNonexistent &&
 487                                      dinfo->stat != egpuInsane);
 488
 489     if (!bGpuExists)
 490     {
 491         sprintf(s, "#%d: %s, stat: %s",
 492                 dinfo->id, "N/A",
 493                 gpu_detect_res_str[dinfo->stat]);
 494     }
 495     else
 496     {
 497         sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
 498                 dinfo->id, dinfo->prop.name,
 499                 dinfo->prop.major, dinfo->prop.minor,
 500                 dinfo->prop.ECCEnabled ? "yes" : " no",
 501                 gpu_detect_res_str[dinfo->stat]);
 502     }
 503 }
 504
 505 int get_current_cuda_gpu_device_id(void)
 506 {
 507     int gpuid;
 508     CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 509
 510     return gpuid;
 511 }
 512
 513 size_t sizeof_gpu_dev_info(void)
 514 {
 515     return sizeof(gmx_device_info_t);
 516 }
 517
 518 void gpu_set_host_malloc_and_free(bool               bUseGpuKernels,
 519                                   gmx_host_alloc_t **nb_alloc,
 520                                   gmx_host_free_t  **nb_free)
 521 {
 522     if (bUseGpuKernels)
 523     {
 524         *nb_alloc = &pmalloc;
 525         *nb_free  = &pfree;
 526     }
 527     else
 528     {
 529         *nb_alloc = nullptr;
 530         *nb_free  = nullptr;
 531     }
 532 }
 533
 534 void startGpuProfiler(void)
 535 {
 536     /* The NVPROF_ID environment variable is set by nvprof and indicates that
 537        mdrun is executed in the CUDA profiler.
 538        If nvprof was run is with "--profile-from-start off", the profiler will
 539        be started here. This way we can avoid tracing the CUDA events from the
 540        first part of the run. Starting the profiler again does nothing.
 541      */
 542     if (cudaProfilerRun)
 543     {
 544         cudaError_t stat;
 545         stat = cudaProfilerStart();
 546         CU_RET_ERR(stat, "cudaProfilerStart failed");
 547     }
 548 }
 549
 550 void stopGpuProfiler(void)
 551 {
 552     /* Stopping the nvidia here allows us to eliminate the subsequent
 553        API calls from the trace, e.g. uninitialization and cleanup. */
 554     if (cudaProfilerRun)
 555     {
 556         cudaError_t stat;
 557         stat = cudaProfilerStop();
 558         CU_RET_ERR(stat, "cudaProfilerStop failed");
 559     }
 560 }
 561
 562 void resetGpuProfiler(void)
 563 {
 564     /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
 565      *  the profiling here (can't stop it) which will achieve the desired effect if
 566      *  the run was started with the profiling disabled.
 567      *
 568      * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.
 569      * stopGpuProfiler();
 570      */
 571     if (cudaProfilerRun)
 572     {
 573         startGpuProfiler();
 574     }
 575 }
 576
 577 int gpu_info_get_stat(const gmx_gpu_info_t &info, int index)
 578 {
 579     return info.gpu_dev[index].stat;
 580 }