src/gromacs/gpu_utils/gpu_utils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \file
  36  *  \brief Define functions for detection and initialization for CUDA devices.
  37  *
  38  *  \author Szilard Pall <pall.szilard@gmail.com>
  39  */
  40
  41 #include "gmxpre.h"
  42
  43 #include "gpu_utils.h"
  44
  45 #include <assert.h>
  46 #include <stdio.h>
  47 #include <stdlib.h>
  48
  49 #include <cuda_profiler_api.h>
  50
  51 #include "gromacs/gpu_utils/cudautils.cuh"
  52 #include "gromacs/gpu_utils/pmalloc_cuda.h"
  53 #include "gromacs/hardware/gpu_hw_info.h"
  54 #include "gromacs/utility/basedefinitions.h"
  55 #include "gromacs/utility/cstringutil.h"
  56 #include "gromacs/utility/exceptions.h"
  57 #include "gromacs/utility/fatalerror.h"
  58 #include "gromacs/utility/gmxassert.h"
  59 #include "gromacs/utility/programcontext.h"
  60 #include "gromacs/utility/smalloc.h"
  61 #include "gromacs/utility/snprintf.h"
  62 #include "gromacs/utility/stringutil.h"
  63
  64 /*! \internal \brief
  65  * Max number of devices supported by CUDA (for consistency checking).
  66  *
  67  * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
  68  */
  69 static int  cuda_max_device_count = 32;
  70
  71 static bool cudaProfilerRun      = ((getenv("NVPROF_ID") != nullptr));
  72
  73 /** Dummy kernel used for sanity checking. */
  74 static __global__ void k_dummy_test(void)
  75 {
  76 }
  77
  78 static cudaError_t checkCompiledTargetCompatibility(int                   deviceId,
  79                                                     const cudaDeviceProp &deviceProp)
  80 {
  81     cudaFuncAttributes attributes;
  82     cudaError_t        stat = cudaFuncGetAttributes(&attributes, k_dummy_test);
  83
  84     if (cudaErrorInvalidDeviceFunction == stat)
  85     {
  86         fprintf(stderr,
  87                 "\nWARNING: The %s binary does not include support for the CUDA architecture of "
  88                 "the GPU ID #%d (compute capability %d.%d) detected during detection. "
  89                 "By default, GROMACS supports all architectures of compute "
  90                 "capability >= 3.0, so your GPU "
  91                 "might be rare, or some architectures were disabled in the build. \n"
  92                 "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
  93                 "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. \n",
  94                 gmx::getProgramContext().displayName(), deviceId,
  95                 deviceProp.major, deviceProp.minor);
  96     }
  97
  98     return stat;
  99 }
 100
 101 bool isHostMemoryPinned(const void *h_ptr)
 102 {
 103     cudaPointerAttributes memoryAttributes;
 104     cudaError_t           stat = cudaPointerGetAttributes(&memoryAttributes, h_ptr);
 105
 106     bool                  result = false;
 107     switch (stat)
 108     {
 109         case cudaSuccess:
 110             result = true;
 111             break;
 112
 113         case cudaErrorInvalidValue:
 114             // If the buffer was not pinned, then it will not be recognized by CUDA at all
 115             result = false;
 116             // Reset the last error status
 117             cudaGetLastError();
 118             break;
 119
 120         default:
 121             CU_RET_ERR(stat, "Unexpected CUDA error");
 122     }
 123     return result;
 124 }
 125
 126 /*!
 127  * \brief Runs GPU sanity checks.
 128  *
 129  * Runs a series of checks to determine that the given GPU and underlying CUDA
 130  * driver/runtime functions properly.
 131  *
 132  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
 133  * \param[in]  dev_prop    The device properties structure
 134  * \returns                0 if the device looks OK, -1 if it sanity checks failed, and -2 if the device is busy
 135  *
 136  * TODO: introduce errors codes and handle errors more smoothly.
 137  */
 138 static int do_sanity_checks(int dev_id, const cudaDeviceProp &dev_prop)
 139 {
 140     cudaError_t cu_err;
 141     int         dev_count, id;
 142
 143     cu_err = cudaGetDeviceCount(&dev_count);
 144     if (cu_err != cudaSuccess)
 145     {
 146         fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
 147                 cudaGetErrorString(cu_err));
 148         return -1;
 149     }
 150
 151     /* no CUDA compatible device at all */
 152     if (dev_count == 0)
 153     {
 154         return -1;
 155     }
 156
 157     /* things might go horribly wrong if cudart is not compatible with the driver */
 158     if (dev_count < 0 || dev_count > cuda_max_device_count)
 159     {
 160         return -1;
 161     }
 162
 163     if (dev_id == -1) /* device already selected let's not destroy the context */
 164     {
 165         cu_err = cudaGetDevice(&id);
 166         if (cu_err != cudaSuccess)
 167         {
 168             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
 169                     cudaGetErrorString(cu_err));
 170             return -1;
 171         }
 172     }
 173     else
 174     {
 175         id = dev_id;
 176         if (id > dev_count - 1) /* pfff there's no such device */
 177         {
 178             fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
 179                     dev_id, dev_count);
 180             return -1;
 181         }
 182     }
 183
 184     /* both major & minor is 9999 if no CUDA capable devices are present */
 185     if (dev_prop.major == 9999 && dev_prop.minor == 9999)
 186     {
 187         return -1;
 188     }
 189     /* we don't care about emulation mode */
 190     if (dev_prop.major == 0)
 191     {
 192         return -1;
 193     }
 194
 195     if (id != -1)
 196     {
 197         cu_err = cudaSetDevice(id);
 198         if (cu_err != cudaSuccess)
 199         {
 200             fprintf(stderr, "Error %d while switching to device #%d: %s\n",
 201                     cu_err, id, cudaGetErrorString(cu_err));
 202             return -1;
 203         }
 204     }
 205
 206     cu_err = checkCompiledTargetCompatibility(dev_id, dev_prop);
 207     // Avoid triggering an error if GPU devices are in exclusive or prohibited mode;
 208     // it is enough to check for cudaErrorDevicesUnavailable only here because
 209     // if we encounter it that will happen in cudaFuncGetAttributes in the above function.
 210     if (cu_err == cudaErrorDevicesUnavailable)
 211     {
 212         return -2;
 213     }
 214     else if (cu_err != cudaSuccess)
 215     {
 216         return -1;
 217     }
 218
 219     /* try to execute a dummy kernel */
 220     try
 221     {
 222         KernelLaunchConfig config;
 223         config.blockSize[0] = 512;
 224         const auto         dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
 225         launchGpuKernel(k_dummy_test, config, nullptr, "Dummy kernel", dummyArguments);
 226     }
 227     catch (gmx::GromacsException &ex)
 228     {
 229         // launchGpuKernel error is not fatal and should continue with marking the device bad
 230         fprintf(stderr, "Error occurred while running dummy kernel sanity check on device #%d:\n %s\n",
 231                 id, formatExceptionMessageToString(ex).c_str());
 232         return -1;
 233     }
 234
 235     if (cudaDeviceSynchronize() != cudaSuccess)
 236     {
 237         return -1;
 238     }
 239
 240     /* destroy context if we created one */
 241     if (id != -1)
 242     {
 243         cu_err = cudaDeviceReset();
 244         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 245     }
 246
 247     return 0;
 248 }
 249
 250 void init_gpu(const gmx_device_info_t *deviceInfo)
 251 {
 252     cudaError_t stat;
 253
 254     assert(deviceInfo);
 255
 256     stat = cudaSetDevice(deviceInfo->id);
 257     if (stat != cudaSuccess)
 258     {
 259         auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id);
 260         CU_RET_ERR(stat, message.c_str());
 261     }
 262
 263     if (debug)
 264     {
 265         fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceInfo->id, deviceInfo->prop.name);
 266     }
 267 }
 268
 269 void free_gpu(const gmx_device_info_t *deviceInfo)
 270 {
 271     // One should only attempt to clear the device context when
 272     // it has been used, but currently the only way to know that a GPU
 273     // device was used is that deviceInfo will be non-null.
 274     if (deviceInfo == nullptr)
 275     {
 276         return;
 277     }
 278
 279     cudaError_t  stat;
 280
 281     if (debug)
 282     {
 283         int gpuid;
 284         stat = cudaGetDevice(&gpuid);
 285         CU_RET_ERR(stat, "cudaGetDevice failed");
 286         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 287     }
 288
 289     stat = cudaDeviceReset();
 290     if (stat != cudaSuccess)
 291     {
 292         gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat));
 293     }
 294 }
 295
 296 gmx_device_info_t *getDeviceInfo(const gmx_gpu_info_t &gpu_info,
 297                                  int                   deviceId)
 298 {
 299     if (deviceId < 0 || deviceId >= gpu_info.n_dev)
 300     {
 301         gmx_incons("Invalid GPU deviceId requested");
 302     }
 303     return &gpu_info.gpu_dev[deviceId];
 304 }
 305
 306 /*! \brief Returns true if the gpu characterized by the device properties is
 307  *  supported by the native gpu acceleration.
 308  *
 309  * \param[in] dev_prop  the CUDA device properties of the gpus to test.
 310  * \returns             true if the GPU properties passed indicate a compatible
 311  *                      GPU, otherwise false.
 312  */
 313 static bool is_gmx_supported_gpu(const cudaDeviceProp &dev_prop)
 314 {
 315     return (dev_prop.major >= 3);
 316 }
 317
 318 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 319  *
 320  *  Returns a status value which indicates compatibility or one of the following
 321  *  errors: incompatibility or insanity (=unexpected behavior).
 322  *
 323  *  As the error handling only permits returning the state of the GPU, this function
 324  *  does not clear the CUDA runtime API status allowing the caller to inspect the error
 325  *  upon return. Note that this also means it is the caller's responsibility to
 326  *  reset the CUDA runtime state.
 327  *
 328  *  \param[in]  deviceId   the ID of the GPU to check.
 329  *  \param[in]  deviceProp the CUDA device properties of the device checked.
 330  *  \returns               the status of the requested device
 331  */
 332 static int is_gmx_supported_gpu_id(int                   deviceId,
 333                                    const cudaDeviceProp &deviceProp)
 334 {
 335     if (!is_gmx_supported_gpu(deviceProp))
 336     {
 337         return egpuIncompatible;
 338     }
 339
 340     /* TODO: currently we do not make a distinction between the type of errors
 341      * that can appear during sanity checks. This needs to be improved, e.g if
 342      * the dummy test kernel fails to execute with a "device busy message" we
 343      * should appropriately report that the device is busy instead of insane.
 344      */
 345     const int checkResult = do_sanity_checks(deviceId, deviceProp);
 346     switch (checkResult)
 347     {
 348         case  0: return egpuCompatible;
 349         case -1: return egpuInsane;
 350         case -2: return egpuUnavailable;
 351         default: GMX_RELEASE_ASSERT(false, "Invalid do_sanity_checks() return value");
 352             return egpuCompatible;
 353     }
 354 }
 355
 356 bool canDetectGpus(std::string *errorMessage)
 357 {
 358     cudaError_t        stat;
 359     int                driverVersion = -1;
 360     stat = cudaDriverGetVersion(&driverVersion);
 361     GMX_ASSERT(stat != cudaErrorInvalidValue, "An impossible null pointer was passed to cudaDriverGetVersion");
 362     GMX_RELEASE_ASSERT(stat == cudaSuccess,
 363                        gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
 364                                          cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str());
 365     bool foundDriver = (driverVersion > 0);
 366     if (!foundDriver)
 367     {
 368         // Can't detect GPUs if there is no driver
 369         if (errorMessage != nullptr)
 370         {
 371             errorMessage->assign("No valid CUDA driver found");
 372         }
 373         return false;
 374     }
 375
 376     int numDevices;
 377     stat = cudaGetDeviceCount(&numDevices);
 378     if (stat != cudaSuccess)
 379     {
 380         if (errorMessage != nullptr)
 381         {
 382             /* cudaGetDeviceCount failed which means that there is
 383              * something wrong with the machine: driver-runtime
 384              * mismatch, all GPUs being busy in exclusive mode,
 385              * invalid CUDA_VISIBLE_DEVICES, or some other condition
 386              * which should result in GROMACS issuing at least a
 387              * warning. */
 388             errorMessage->assign(cudaGetErrorString(stat));
 389         }
 390
 391         // Consume the error now that we have prepared to handle
 392         // it. This stops it reappearing next time we check for
 393         // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
 394         // valid devices, then cudaGetLastError returns the
 395         // (undocumented) cudaErrorNoDevice, but this should not be a
 396         // problem as there should be no future CUDA API calls.
 397         // NVIDIA bug report #2038718 has been filed.
 398         cudaGetLastError();
 399         // Can't detect GPUs
 400         return false;
 401     }
 402
 403     // We don't actually use numDevices here, that's not the job of
 404     // this function.
 405     return true;
 406 }
 407
 408 void findGpus(gmx_gpu_info_t *gpu_info)
 409 {
 410     assert(gpu_info);
 411
 412     gpu_info->n_dev_compatible = 0;
 413
 414     int         ndev;
 415     cudaError_t stat = cudaGetDeviceCount(&ndev);
 416     if (stat != cudaSuccess)
 417     {
 418         GMX_THROW(gmx::InternalError("Invalid call of findGpus() when CUDA API returned an error, perhaps "
 419                                      "canDetectGpus() was not called appropriately beforehand."));
 420     }
 421
 422     // We expect to start device support/sanity checks with a clean runtime error state
 423     gmx::ensureNoPendingCudaError("");
 424
 425     gmx_device_info_t *devs;
 426     snew(devs, ndev);
 427     for (int i = 0; i < ndev; i++)
 428     {
 429         cudaDeviceProp prop;
 430         memset(&prop, 0, sizeof(cudaDeviceProp));
 431         stat = cudaGetDeviceProperties(&prop, i);
 432         int checkResult;
 433         if (stat != cudaSuccess)
 434         {
 435             // Will handle the error reporting below
 436             checkResult = egpuInsane;
 437         }
 438         else
 439         {
 440             checkResult = is_gmx_supported_gpu_id(i, prop);
 441         }
 442
 443         devs[i].id   = i;
 444         devs[i].prop = prop;
 445         devs[i].stat = checkResult;
 446
 447         if (checkResult == egpuCompatible)
 448         {
 449             gpu_info->n_dev_compatible++;
 450         }
 451         else
 452         {
 453             // TODO:
 454             //  - we inspect the CUDA API state to retrieve and record any
 455             //    errors that occurred during is_gmx_supported_gpu_id() here,
 456             //    but this would be more elegant done within is_gmx_supported_gpu_id()
 457             //    and only return a string with the error if one was encountered.
 458             //  - we'll be reporting without rank information which is not ideal.
 459             //  - we'll end up warning also in cases where users would already
 460             //    get an error before mdrun aborts.
 461             //
 462             // Here we also clear the CUDA API error state so potential
 463             // errors during sanity checks don't propagate.
 464             if ((stat = cudaGetLastError()) != cudaSuccess)
 465             {
 466                 gmx_warning("An error occurred while sanity checking device #%d; %s: %s",
 467                             devs[i].id, cudaGetErrorName(stat), cudaGetErrorString(stat));
 468             }
 469         }
 470     }
 471
 472     stat = cudaPeekAtLastError();
 473     GMX_RELEASE_ASSERT(stat == cudaSuccess,
 474                        gmx::formatString("We promise to return with clean CUDA state, but non-success state encountered: %s: %s",
 475                                          cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str());
 476
 477     gpu_info->n_dev   = ndev;
 478     gpu_info->gpu_dev = devs;
 479 }
 480
 481 void get_gpu_device_info_string(char *s, const gmx_gpu_info_t &gpu_info, int index)
 482 {
 483     assert(s);
 484
 485     if (index < 0 && index >= gpu_info.n_dev)
 486     {
 487         return;
 488     }
 489
 490     gmx_device_info_t *dinfo = &gpu_info.gpu_dev[index];
 491
 492     bool               bGpuExists = (dinfo->stat != egpuNonexistent &&
 493                                      dinfo->stat != egpuInsane);
 494
 495     if (!bGpuExists)
 496     {
 497         sprintf(s, "#%d: %s, stat: %s",
 498                 dinfo->id, "N/A",
 499                 gpu_detect_res_str[dinfo->stat]);
 500     }
 501     else
 502     {
 503         sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
 504                 dinfo->id, dinfo->prop.name,
 505                 dinfo->prop.major, dinfo->prop.minor,
 506                 dinfo->prop.ECCEnabled ? "yes" : " no",
 507                 gpu_detect_res_str[dinfo->stat]);
 508     }
 509 }
 510
 511 int get_current_cuda_gpu_device_id(void)
 512 {
 513     int gpuid;
 514     CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 515
 516     return gpuid;
 517 }
 518
 519 size_t sizeof_gpu_dev_info(void)
 520 {
 521     return sizeof(gmx_device_info_t);
 522 }
 523
 524 void gpu_set_host_malloc_and_free(bool               bUseGpuKernels,
 525                                   gmx_host_alloc_t **nb_alloc,
 526                                   gmx_host_free_t  **nb_free)
 527 {
 528     if (bUseGpuKernels)
 529     {
 530         *nb_alloc = &pmalloc;
 531         *nb_free  = &pfree;
 532     }
 533     else
 534     {
 535         *nb_alloc = nullptr;
 536         *nb_free  = nullptr;
 537     }
 538 }
 539
 540 void startGpuProfiler(void)
 541 {
 542     /* The NVPROF_ID environment variable is set by nvprof and indicates that
 543        mdrun is executed in the CUDA profiler.
 544        If nvprof was run is with "--profile-from-start off", the profiler will
 545        be started here. This way we can avoid tracing the CUDA events from the
 546        first part of the run. Starting the profiler again does nothing.
 547      */
 548     if (cudaProfilerRun)
 549     {
 550         cudaError_t stat;
 551         stat = cudaProfilerStart();
 552         CU_RET_ERR(stat, "cudaProfilerStart failed");
 553     }
 554 }
 555
 556 void stopGpuProfiler(void)
 557 {
 558     /* Stopping the nvidia here allows us to eliminate the subsequent
 559        API calls from the trace, e.g. uninitialization and cleanup. */
 560     if (cudaProfilerRun)
 561     {
 562         cudaError_t stat;
 563         stat = cudaProfilerStop();
 564         CU_RET_ERR(stat, "cudaProfilerStop failed");
 565     }
 566 }
 567
 568 void resetGpuProfiler(void)
 569 {
 570     /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
 571      *  the profiling here (can't stop it) which will achieve the desired effect if
 572      *  the run was started with the profiling disabled.
 573      *
 574      * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.
 575      * stopGpuProfiler();
 576      */
 577     if (cudaProfilerRun)
 578     {
 579         startGpuProfiler();
 580     }
 581 }
 582
 583 int gpu_info_get_stat(const gmx_gpu_info_t &info, int index)
 584 {
 585     return info.gpu_dev[index].stat;
 586 }