src/gromacs/gpu_utils/gpu_utils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \file
  36  *  \brief Define functions for detection and initialization for CUDA devices.
  37  *
  38  *  \author Szilard Pall <pall.szilard@gmail.com>
  39  */
  40
  41 #include "gmxpre.h"
  42
  43 #include "gpu_utils.h"
  44
  45 #include <assert.h>
  46 #include <stdio.h>
  47 #include <stdlib.h>
  48
  49 #include <cuda_profiler_api.h>
  50
  51 #include "gromacs/gpu_utils/cudautils.cuh"
  52 #include "gromacs/gpu_utils/pmalloc_cuda.h"
  53 #include "gromacs/hardware/gpu_hw_info.h"
  54 #include "gromacs/utility/basedefinitions.h"
  55 #include "gromacs/utility/cstringutil.h"
  56 #include "gromacs/utility/exceptions.h"
  57 #include "gromacs/utility/fatalerror.h"
  58 #include "gromacs/utility/gmxassert.h"
  59 #include "gromacs/utility/programcontext.h"
  60 #include "gromacs/utility/smalloc.h"
  61 #include "gromacs/utility/snprintf.h"
  62 #include "gromacs/utility/stringutil.h"
  63
  64 /*! \internal \brief
  65  * Max number of devices supported by CUDA (for consistency checking).
  66  *
  67  * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
  68  */
  69 static int  cuda_max_device_count = 32;
  70
  71 static bool cudaProfilerRun      = ((getenv("NVPROF_ID") != nullptr));
  72
  73 /** Dummy kernel used for sanity checking. */
  74 static __global__ void k_dummy_test(void)
  75 {
  76 }
  77
  78 static void checkCompiledTargetCompatibility(int                   deviceId,
  79                                              const cudaDeviceProp &deviceProp)
  80 {
  81     cudaFuncAttributes attributes;
  82     cudaError_t        stat = cudaFuncGetAttributes(&attributes, k_dummy_test);
  83
  84     if (cudaErrorInvalidDeviceFunction == stat)
  85     {
  86         gmx_fatal(FARGS,
  87                   "The %s binary does not include support for the CUDA architecture of a "
  88                   "detected GPU: %s, ID #%d (compute capability %d.%d). "
  89                   "By default, GROMACS supports all architectures of compute "
  90                   "capability >= 3.0, so your GPU "
  91                   "might be rare, or some architectures were disabled in the build. "
  92                   "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
  93                   "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. "
  94                   "To work around this error, use the CUDA_VISIBLE_DEVICES environment"
  95                   "variable to pass a list of GPUs that excludes the ID %d.",
  96                   gmx::getProgramContext().displayName(), deviceProp.name, deviceId,
  97                   deviceProp.major, deviceProp.minor, deviceId);
  98     }
  99
 100     CU_RET_ERR(stat, "cudaFuncGetAttributes failed");
 101 }
 102
 103 bool isHostMemoryPinned(const void *h_ptr)
 104 {
 105     cudaPointerAttributes memoryAttributes;
 106     cudaError_t           stat = cudaPointerGetAttributes(&memoryAttributes, h_ptr);
 107
 108     bool                  result = false;
 109     switch (stat)
 110     {
 111         case cudaSuccess:
 112             result = true;
 113             break;
 114
 115         case cudaErrorInvalidValue:
 116             // If the buffer was not pinned, then it will not be recognized by CUDA at all
 117             result = false;
 118             // Reset the last error status
 119             cudaGetLastError();
 120             break;
 121
 122         default:
 123             CU_RET_ERR(stat, "Unexpected CUDA error");
 124     }
 125     return result;
 126 }
 127
 128 /*!
 129  * \brief Runs GPU sanity checks.
 130  *
 131  * Runs a series of checks to determine that the given GPU and underlying CUDA
 132  * driver/runtime functions properly.
 133  *
 134  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
 135  * \param[in]  dev_prop    The device properties structure
 136  * \returns                0 if the device looks OK
 137  *
 138  * TODO: introduce errors codes and handle errors more smoothly.
 139  */
 140 static int do_sanity_checks(int dev_id, const cudaDeviceProp &dev_prop)
 141 {
 142     cudaError_t cu_err;
 143     int         dev_count, id;
 144
 145     cu_err = cudaGetDeviceCount(&dev_count);
 146     if (cu_err != cudaSuccess)
 147     {
 148         fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
 149                 cudaGetErrorString(cu_err));
 150         return -1;
 151     }
 152
 153     /* no CUDA compatible device at all */
 154     if (dev_count == 0)
 155     {
 156         return -1;
 157     }
 158
 159     /* things might go horribly wrong if cudart is not compatible with the driver */
 160     if (dev_count < 0 || dev_count > cuda_max_device_count)
 161     {
 162         return -1;
 163     }
 164
 165     if (dev_id == -1) /* device already selected let's not destroy the context */
 166     {
 167         cu_err = cudaGetDevice(&id);
 168         if (cu_err != cudaSuccess)
 169         {
 170             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
 171                     cudaGetErrorString(cu_err));
 172             return -1;
 173         }
 174     }
 175     else
 176     {
 177         id = dev_id;
 178         if (id > dev_count - 1) /* pfff there's no such device */
 179         {
 180             fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
 181                     dev_id, dev_count);
 182             return -1;
 183         }
 184     }
 185
 186     /* both major & minor is 9999 if no CUDA capable devices are present */
 187     if (dev_prop.major == 9999 && dev_prop.minor == 9999)
 188     {
 189         return -1;
 190     }
 191     /* we don't care about emulation mode */
 192     if (dev_prop.major == 0)
 193     {
 194         return -1;
 195     }
 196
 197     if (id != -1)
 198     {
 199         cu_err = cudaSetDevice(id);
 200         if (cu_err != cudaSuccess)
 201         {
 202             fprintf(stderr, "Error %d while switching to device #%d: %s\n",
 203                     cu_err, id, cudaGetErrorString(cu_err));
 204             return -1;
 205         }
 206     }
 207
 208     /* try to execute a dummy kernel */
 209     checkCompiledTargetCompatibility(dev_id, dev_prop);
 210
 211     KernelLaunchConfig config;
 212     config.blockSize[0] = 512;
 213     const auto         dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
 214     launchGpuKernel(k_dummy_test, config, nullptr, "Dummy kernel", dummyArguments);
 215     if (cudaDeviceSynchronize() != cudaSuccess)
 216     {
 217         return -1;
 218     }
 219
 220     /* destroy context if we created one */
 221     if (id != -1)
 222     {
 223         cu_err = cudaDeviceReset();
 224         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 225     }
 226
 227     return 0;
 228 }
 229
 230 void init_gpu(const gmx_device_info_t *deviceInfo)
 231 {
 232     cudaError_t stat;
 233
 234     assert(deviceInfo);
 235
 236     stat = cudaSetDevice(deviceInfo->id);
 237     if (stat != cudaSuccess)
 238     {
 239         auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id);
 240         CU_RET_ERR(stat, message.c_str());
 241     }
 242
 243     if (debug)
 244     {
 245         fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceInfo->id, deviceInfo->prop.name);
 246     }
 247 }
 248
 249 void free_gpu(const gmx_device_info_t *deviceInfo)
 250 {
 251     // One should only attempt to clear the device context when
 252     // it has been used, but currently the only way to know that a GPU
 253     // device was used is that deviceInfo will be non-null.
 254     if (deviceInfo == nullptr)
 255     {
 256         return;
 257     }
 258
 259     cudaError_t  stat;
 260
 261     if (debug)
 262     {
 263         int gpuid;
 264         stat = cudaGetDevice(&gpuid);
 265         CU_RET_ERR(stat, "cudaGetDevice failed");
 266         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 267     }
 268
 269     stat = cudaDeviceReset();
 270     if (stat != cudaSuccess)
 271     {
 272         gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat));
 273     }
 274 }
 275
 276 gmx_device_info_t *getDeviceInfo(const gmx_gpu_info_t &gpu_info,
 277                                  int                   deviceId)
 278 {
 279     if (deviceId < 0 || deviceId >= gpu_info.n_dev)
 280     {
 281         gmx_incons("Invalid GPU deviceId requested");
 282     }
 283     return &gpu_info.gpu_dev[deviceId];
 284 }
 285
 286 /*! \brief Returns true if the gpu characterized by the device properties is
 287  *  supported by the native gpu acceleration.
 288  *
 289  * \param[in] dev_prop  the CUDA device properties of the gpus to test.
 290  * \returns             true if the GPU properties passed indicate a compatible
 291  *                      GPU, otherwise false.
 292  */
 293 static bool is_gmx_supported_gpu(const cudaDeviceProp &dev_prop)
 294 {
 295     return (dev_prop.major >= 3);
 296 }
 297
 298 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 299  *
 300  *  Returns a status value which indicates compatibility or one of the following
 301  *  errors: incompatibility or insanity (=unexpected behavior).
 302  *
 303  *  As the error handling only permits returning the state of the GPU, this function
 304  *  does not clear the CUDA runtime API status allowing the caller to inspect the error
 305  *  upon return. Note that this also means it is the caller's responsibility to
 306  *  reset the CUDA runtime state.
 307  *
 308  *  \param[in]  deviceId   the ID of the GPU to check.
 309  *  \param[in]  deviceProp the CUDA device properties of the device checked.
 310  *  \returns               the status of the requested device
 311  */
 312 static int is_gmx_supported_gpu_id(int                   deviceId,
 313                                    const cudaDeviceProp &deviceProp)
 314 {
 315     if (!is_gmx_supported_gpu(deviceProp))
 316     {
 317         return egpuIncompatible;
 318     }
 319
 320     /* TODO: currently we do not make a distinction between the type of errors
 321      * that can appear during sanity checks. This needs to be improved, e.g if
 322      * the dummy test kernel fails to execute with a "device busy message" we
 323      * should appropriately report that the device is busy instead of insane.
 324      */
 325     if (do_sanity_checks(deviceId, deviceProp) != 0)
 326     {
 327         return egpuInsane;
 328     }
 329
 330     return egpuCompatible;
 331 }
 332
 333 bool canDetectGpus(std::string *errorMessage)
 334 {
 335     cudaError_t        stat;
 336     int                driverVersion = -1;
 337     stat = cudaDriverGetVersion(&driverVersion);
 338     GMX_ASSERT(stat != cudaErrorInvalidValue, "An impossible null pointer was passed to cudaDriverGetVersion");
 339     GMX_RELEASE_ASSERT(stat == cudaSuccess,
 340                        gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
 341                                          cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str());
 342     bool foundDriver = (driverVersion > 0);
 343     if (!foundDriver)
 344     {
 345         // Can't detect GPUs if there is no driver
 346         if (errorMessage != nullptr)
 347         {
 348             errorMessage->assign("No valid CUDA driver found");
 349         }
 350         return false;
 351     }
 352
 353     int numDevices;
 354     stat = cudaGetDeviceCount(&numDevices);
 355     if (stat != cudaSuccess)
 356     {
 357         if (errorMessage != nullptr)
 358         {
 359             /* cudaGetDeviceCount failed which means that there is
 360              * something wrong with the machine: driver-runtime
 361              * mismatch, all GPUs being busy in exclusive mode,
 362              * invalid CUDA_VISIBLE_DEVICES, or some other condition
 363              * which should result in GROMACS issuing at least a
 364              * warning. */
 365             errorMessage->assign(cudaGetErrorString(stat));
 366         }
 367
 368         // Consume the error now that we have prepared to handle
 369         // it. This stops it reappearing next time we check for
 370         // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
 371         // valid devices, then cudaGetLastError returns the
 372         // (undocumented) cudaErrorNoDevice, but this should not be a
 373         // problem as there should be no future CUDA API calls.
 374         // NVIDIA bug report #2038718 has been filed.
 375         cudaGetLastError();
 376         // Can't detect GPUs
 377         return false;
 378     }
 379
 380     // We don't actually use numDevices here, that's not the job of
 381     // this function.
 382     return true;
 383 }
 384
 385 void findGpus(gmx_gpu_info_t *gpu_info)
 386 {
 387     assert(gpu_info);
 388
 389     gpu_info->n_dev_compatible = 0;
 390
 391     int         ndev;
 392     cudaError_t stat = cudaGetDeviceCount(&ndev);
 393     if (stat != cudaSuccess)
 394     {
 395         GMX_THROW(gmx::InternalError("Invalid call of findGpus() when CUDA API returned an error, perhaps "
 396                                      "canDetectGpus() was not called appropriately beforehand."));
 397     }
 398
 399     // We expect to start device support/sanity checks with a clean runtime error state
 400     gmx::ensureNoPendingCudaError("");
 401
 402     gmx_device_info_t *devs;
 403     snew(devs, ndev);
 404     for (int i = 0; i < ndev; i++)
 405     {
 406         cudaDeviceProp prop;
 407         memset(&prop, 0, sizeof(cudaDeviceProp));
 408         stat = cudaGetDeviceProperties(&prop, i);
 409         int checkResult;
 410         if (stat != cudaSuccess)
 411         {
 412             // Will handle the error reporting below
 413             checkResult = egpuInsane;
 414         }
 415         else
 416         {
 417             checkResult = is_gmx_supported_gpu_id(i, prop);
 418         }
 419
 420         devs[i].id   = i;
 421         devs[i].prop = prop;
 422         devs[i].stat = checkResult;
 423
 424         if (checkResult == egpuCompatible)
 425         {
 426             gpu_info->n_dev_compatible++;
 427         }
 428         else
 429         {
 430             // TODO:
 431             //  - we inspect the CUDA API state to retrieve and record any
 432             //    errors that occurred during is_gmx_supported_gpu_id() here,
 433             //    but this would be more elegant done within is_gmx_supported_gpu_id()
 434             //    and only return a string with the error if one was encountered.
 435             //  - we'll be reporting without rank information which is not ideal.
 436             //  - we'll end up warning also in cases where users would already
 437             //    get an error before mdrun aborts.
 438             //
 439             // Here we also clear the CUDA API error state so potential
 440             // errors during sanity checks don't propagate.
 441             if ((stat = cudaGetLastError()) != cudaSuccess)
 442             {
 443                 gmx_warning("An error occurred while sanity checking device #%d; %s: %s",
 444                             devs[i].id, cudaGetErrorName(stat), cudaGetErrorString(stat));
 445             }
 446         }
 447     }
 448
 449     stat = cudaPeekAtLastError();
 450     GMX_RELEASE_ASSERT(stat == cudaSuccess,
 451                        gmx::formatString("We promise to return with clean CUDA state, but non-success state encountered: %s: %s",
 452                                          cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str());
 453
 454     gpu_info->n_dev   = ndev;
 455     gpu_info->gpu_dev = devs;
 456 }
 457
 458 void get_gpu_device_info_string(char *s, const gmx_gpu_info_t &gpu_info, int index)
 459 {
 460     assert(s);
 461
 462     if (index < 0 && index >= gpu_info.n_dev)
 463     {
 464         return;
 465     }
 466
 467     gmx_device_info_t *dinfo = &gpu_info.gpu_dev[index];
 468
 469     bool               bGpuExists = (dinfo->stat != egpuNonexistent &&
 470                                      dinfo->stat != egpuInsane);
 471
 472     if (!bGpuExists)
 473     {
 474         sprintf(s, "#%d: %s, stat: %s",
 475                 dinfo->id, "N/A",
 476                 gpu_detect_res_str[dinfo->stat]);
 477     }
 478     else
 479     {
 480         sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
 481                 dinfo->id, dinfo->prop.name,
 482                 dinfo->prop.major, dinfo->prop.minor,
 483                 dinfo->prop.ECCEnabled ? "yes" : " no",
 484                 gpu_detect_res_str[dinfo->stat]);
 485     }
 486 }
 487
 488 int get_current_cuda_gpu_device_id(void)
 489 {
 490     int gpuid;
 491     CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 492
 493     return gpuid;
 494 }
 495
 496 size_t sizeof_gpu_dev_info(void)
 497 {
 498     return sizeof(gmx_device_info_t);
 499 }
 500
 501 void startGpuProfiler(void)
 502 {
 503     /* The NVPROF_ID environment variable is set by nvprof and indicates that
 504        mdrun is executed in the CUDA profiler.
 505        If nvprof was run is with "--profile-from-start off", the profiler will
 506        be started here. This way we can avoid tracing the CUDA events from the
 507        first part of the run. Starting the profiler again does nothing.
 508      */
 509     if (cudaProfilerRun)
 510     {
 511         cudaError_t stat;
 512         stat = cudaProfilerStart();
 513         CU_RET_ERR(stat, "cudaProfilerStart failed");
 514     }
 515 }
 516
 517 void stopGpuProfiler(void)
 518 {
 519     /* Stopping the nvidia here allows us to eliminate the subsequent
 520        API calls from the trace, e.g. uninitialization and cleanup. */
 521     if (cudaProfilerRun)
 522     {
 523         cudaError_t stat;
 524         stat = cudaProfilerStop();
 525         CU_RET_ERR(stat, "cudaProfilerStop failed");
 526     }
 527 }
 528
 529 void resetGpuProfiler(void)
 530 {
 531     /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
 532      *  the profiling here (can't stop it) which will achieve the desired effect if
 533      *  the run was started with the profiling disabled.
 534      *
 535      * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.
 536      * stopGpuProfiler();
 537      */
 538     if (cudaProfilerRun)
 539     {
 540         startGpuProfiler();
 541     }
 542 }
 543
 544 int gpu_info_get_stat(const gmx_gpu_info_t &info, int index)
 545 {
 546     return info.gpu_dev[index].stat;
 547 }