src/gromacs/gpu_utils/gpu_utils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \file
  36  *  \brief Define functions for detection and initialization for CUDA devices.
  37  *
  38  *  \author Szilard Pall <pall.szilard@gmail.com>
  39  */
  40
  41 #include "gmxpre.h"
  42
  43 #include "gpu_utils.h"
  44
  45 #include <assert.h>
  46 #include <stdio.h>
  47 #include <stdlib.h>
  48
  49 #include <cuda_profiler_api.h>
  50
  51 #include "gromacs/gpu_utils/cudautils.cuh"
  52 #include "gromacs/gpu_utils/pmalloc_cuda.h"
  53 #include "gromacs/hardware/gpu_hw_info.h"
  54 #include "gromacs/utility/basedefinitions.h"
  55 #include "gromacs/utility/cstringutil.h"
  56 #include "gromacs/utility/exceptions.h"
  57 #include "gromacs/utility/fatalerror.h"
  58 #include "gromacs/utility/gmxassert.h"
  59 #include "gromacs/utility/logger.h"
  60 #include "gromacs/utility/programcontext.h"
  61 #include "gromacs/utility/smalloc.h"
  62 #include "gromacs/utility/snprintf.h"
  63 #include "gromacs/utility/stringutil.h"
  64
  65 /*! \internal \brief
  66  * Max number of devices supported by CUDA (for consistency checking).
  67  *
  68  * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
  69  */
  70 static int  cuda_max_device_count = 32;
  71
  72 static bool cudaProfilerRun      = ((getenv("NVPROF_ID") != nullptr));
  73
  74 /** Dummy kernel used for sanity checking. */
  75 static __global__ void k_dummy_test(void)
  76 {
  77 }
  78
  79 static void checkCompiledTargetCompatibility(int                   deviceId,
  80                                              const cudaDeviceProp &deviceProp)
  81 {
  82     cudaFuncAttributes attributes;
  83     cudaError_t        stat = cudaFuncGetAttributes(&attributes, k_dummy_test);
  84
  85     if (cudaErrorInvalidDeviceFunction == stat)
  86     {
  87         gmx_fatal(FARGS,
  88                   "The %s binary does not include support for the CUDA architecture of a "
  89                   "detected GPU: %s, ID #%d (compute capability %d.%d). "
  90                   "By default, GROMACS supports all architectures of compute "
  91                   "capability >= 3.0, so your GPU "
  92                   "might be rare, or some architectures were disabled in the build. "
  93                   "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
  94                   "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. "
  95                   "To work around this error, use the CUDA_VISIBLE_DEVICES environment"
  96                   "variable to pass a list of GPUs that excludes the ID %d.",
  97                   gmx::getProgramContext().displayName(), deviceProp.name, deviceId,
  98                   deviceProp.major, deviceProp.minor, deviceId);
  99     }
 100
 101     CU_RET_ERR(stat, "cudaFuncGetAttributes failed");
 102 }
 103
 104 bool isHostMemoryPinned(const void *h_ptr)
 105 {
 106     cudaPointerAttributes memoryAttributes;
 107     cudaError_t           stat = cudaPointerGetAttributes(&memoryAttributes, h_ptr);
 108
 109     bool                  result = false;
 110     switch (stat)
 111     {
 112         case cudaSuccess:
 113             result = true;
 114             break;
 115
 116         case cudaErrorInvalidValue:
 117             // If the buffer was not pinned, then it will not be recognized by CUDA at all
 118             result = false;
 119             // Reset the last error status
 120             cudaGetLastError();
 121             break;
 122
 123         default:
 124             CU_RET_ERR(stat, "Unexpected CUDA error");
 125     }
 126     return result;
 127 }
 128
 129 /*!
 130  * \brief Runs GPU sanity checks.
 131  *
 132  * Runs a series of checks to determine that the given GPU and underlying CUDA
 133  * driver/runtime functions properly.
 134  *
 135  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
 136  * \param[in]  dev_prop    The device properties structure
 137  * \returns                0 if the device looks OK
 138  *
 139  * TODO: introduce errors codes and handle errors more smoothly.
 140  */
 141 static int do_sanity_checks(int dev_id, const cudaDeviceProp &dev_prop)
 142 {
 143     cudaError_t cu_err;
 144     int         dev_count, id;
 145
 146     cu_err = cudaGetDeviceCount(&dev_count);
 147     if (cu_err != cudaSuccess)
 148     {
 149         fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
 150                 cudaGetErrorString(cu_err));
 151         return -1;
 152     }
 153
 154     /* no CUDA compatible device at all */
 155     if (dev_count == 0)
 156     {
 157         return -1;
 158     }
 159
 160     /* things might go horribly wrong if cudart is not compatible with the driver */
 161     if (dev_count < 0 || dev_count > cuda_max_device_count)
 162     {
 163         return -1;
 164     }
 165
 166     if (dev_id == -1) /* device already selected let's not destroy the context */
 167     {
 168         cu_err = cudaGetDevice(&id);
 169         if (cu_err != cudaSuccess)
 170         {
 171             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
 172                     cudaGetErrorString(cu_err));
 173             return -1;
 174         }
 175     }
 176     else
 177     {
 178         id = dev_id;
 179         if (id > dev_count - 1) /* pfff there's no such device */
 180         {
 181             fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
 182                     dev_id, dev_count);
 183             return -1;
 184         }
 185     }
 186
 187     /* both major & minor is 9999 if no CUDA capable devices are present */
 188     if (dev_prop.major == 9999 && dev_prop.minor == 9999)
 189     {
 190         return -1;
 191     }
 192     /* we don't care about emulation mode */
 193     if (dev_prop.major == 0)
 194     {
 195         return -1;
 196     }
 197
 198     if (id != -1)
 199     {
 200         cu_err = cudaSetDevice(id);
 201         if (cu_err != cudaSuccess)
 202         {
 203             fprintf(stderr, "Error %d while switching to device #%d: %s\n",
 204                     cu_err, id, cudaGetErrorString(cu_err));
 205             return -1;
 206         }
 207     }
 208
 209     /* try to execute a dummy kernel */
 210     checkCompiledTargetCompatibility(dev_id, dev_prop);
 211
 212     KernelLaunchConfig config;
 213     config.blockSize[0] = 512;
 214     const auto         dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
 215     launchGpuKernel(k_dummy_test, config, nullptr, "Dummy kernel", dummyArguments);
 216     if (cudaDeviceSynchronize() != cudaSuccess)
 217     {
 218         return -1;
 219     }
 220
 221     /* destroy context if we created one */
 222     if (id != -1)
 223     {
 224         cu_err = cudaDeviceReset();
 225         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 226     }
 227
 228     return 0;
 229 }
 230
 231 void init_gpu(const gmx_device_info_t *deviceInfo)
 232 {
 233     cudaError_t stat;
 234
 235     assert(deviceInfo);
 236
 237     stat = cudaSetDevice(deviceInfo->id);
 238     if (stat != cudaSuccess)
 239     {
 240         auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id);
 241         CU_RET_ERR(stat, message.c_str());
 242     }
 243
 244     if (debug)
 245     {
 246         fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceInfo->id, deviceInfo->prop.name);
 247     }
 248 }
 249
 250 void free_gpu(const gmx_device_info_t *deviceInfo)
 251 {
 252     // One should only attempt to clear the device context when
 253     // it has been used, but currently the only way to know that a GPU
 254     // device was used is that deviceInfo will be non-null.
 255     if (deviceInfo == nullptr)
 256     {
 257         return;
 258     }
 259
 260     cudaError_t  stat;
 261
 262     if (debug)
 263     {
 264         int gpuid;
 265         stat = cudaGetDevice(&gpuid);
 266         CU_RET_ERR(stat, "cudaGetDevice failed");
 267         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 268     }
 269
 270     stat = cudaDeviceReset();
 271     if (stat != cudaSuccess)
 272     {
 273         gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat));
 274     }
 275 }
 276
 277 gmx_device_info_t *getDeviceInfo(const gmx_gpu_info_t &gpu_info,
 278                                  int                   deviceId)
 279 {
 280     if (deviceId < 0 || deviceId >= gpu_info.n_dev)
 281     {
 282         gmx_incons("Invalid GPU deviceId requested");
 283     }
 284     return &gpu_info.gpu_dev[deviceId];
 285 }
 286
 287 /*! \brief Returns true if the gpu characterized by the device properties is
 288  *  supported by the native gpu acceleration.
 289  *
 290  * \param[in] dev_prop  the CUDA device properties of the gpus to test.
 291  * \returns             true if the GPU properties passed indicate a compatible
 292  *                      GPU, otherwise false.
 293  */
 294 static bool is_gmx_supported_gpu(const cudaDeviceProp &dev_prop)
 295 {
 296     return (dev_prop.major >= 3);
 297 }
 298
 299 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 300  *
 301  *  Returns a status value which indicates compatibility or one of the following
 302  *  errors: incompatibility or insanity (=unexpected behavior).
 303  *
 304  *  As the error handling only permits returning the state of the GPU, this function
 305  *  does not clear the CUDA runtime API status allowing the caller to inspect the error
 306  *  upon return. Note that this also means it is the caller's responsibility to
 307  *  reset the CUDA runtime state.
 308  *
 309  *  \param[in]  deviceId   the ID of the GPU to check.
 310  *  \param[in]  deviceProp the CUDA device properties of the device checked.
 311  *  \returns               the status of the requested device
 312  */
 313 static int is_gmx_supported_gpu_id(int                   deviceId,
 314                                    const cudaDeviceProp &deviceProp)
 315 {
 316     if (!is_gmx_supported_gpu(deviceProp))
 317     {
 318         return egpuIncompatible;
 319     }
 320
 321     /* TODO: currently we do not make a distinction between the type of errors
 322      * that can appear during sanity checks. This needs to be improved, e.g if
 323      * the dummy test kernel fails to execute with a "device busy message" we
 324      * should appropriately report that the device is busy instead of insane.
 325      */
 326     if (do_sanity_checks(deviceId, deviceProp) != 0)
 327     {
 328         return egpuInsane;
 329     }
 330
 331     return egpuCompatible;
 332 }
 333
 334 bool isGpuDetectionFunctional(std::string *errorMessage)
 335 {
 336     cudaError_t        stat;
 337     int                driverVersion = -1;
 338     stat = cudaDriverGetVersion(&driverVersion);
 339     GMX_ASSERT(stat != cudaErrorInvalidValue, "An impossible null pointer was passed to cudaDriverGetVersion");
 340     GMX_RELEASE_ASSERT(stat == cudaSuccess,
 341                        gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
 342                                          cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str());
 343     bool foundDriver = (driverVersion > 0);
 344     if (!foundDriver)
 345     {
 346         // Can't detect GPUs if there is no driver
 347         if (errorMessage != nullptr)
 348         {
 349             errorMessage->assign("No valid CUDA driver found");
 350         }
 351         return false;
 352     }
 353
 354     int numDevices;
 355     stat = cudaGetDeviceCount(&numDevices);
 356     if (stat != cudaSuccess)
 357     {
 358         if (errorMessage != nullptr)
 359         {
 360             /* cudaGetDeviceCount failed which means that there is
 361              * something wrong with the machine: driver-runtime
 362              * mismatch, all GPUs being busy in exclusive mode,
 363              * invalid CUDA_VISIBLE_DEVICES, or some other condition
 364              * which should result in GROMACS issuing at least a
 365              * warning. */
 366             errorMessage->assign(cudaGetErrorString(stat));
 367         }
 368
 369         // Consume the error now that we have prepared to handle
 370         // it. This stops it reappearing next time we check for
 371         // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
 372         // valid devices, then cudaGetLastError returns the
 373         // (undocumented) cudaErrorNoDevice, but this should not be a
 374         // problem as there should be no future CUDA API calls.
 375         // NVIDIA bug report #2038718 has been filed.
 376         cudaGetLastError();
 377         // Can't detect GPUs
 378         return false;
 379     }
 380
 381     // We don't actually use numDevices here, that's not the job of
 382     // this function.
 383     return true;
 384 }
 385
 386 void findGpus(gmx_gpu_info_t *gpu_info)
 387 {
 388     assert(gpu_info);
 389
 390     gpu_info->n_dev_compatible = 0;
 391
 392     int         ndev;
 393     cudaError_t stat = cudaGetDeviceCount(&ndev);
 394     if (stat != cudaSuccess)
 395     {
 396         GMX_THROW(gmx::InternalError("Invalid call of findGpus() when CUDA API returned an error, perhaps "
 397                                      "canDetectGpus() was not called appropriately beforehand."));
 398     }
 399
 400     // We expect to start device support/sanity checks with a clean runtime error state
 401     gmx::ensureNoPendingCudaError("");
 402
 403     gmx_device_info_t *devs;
 404     snew(devs, ndev);
 405     for (int i = 0; i < ndev; i++)
 406     {
 407         cudaDeviceProp prop;
 408         memset(&prop, 0, sizeof(cudaDeviceProp));
 409         stat = cudaGetDeviceProperties(&prop, i);
 410         int checkResult;
 411         if (stat != cudaSuccess)
 412         {
 413             // Will handle the error reporting below
 414             checkResult = egpuInsane;
 415         }
 416         else
 417         {
 418             checkResult = is_gmx_supported_gpu_id(i, prop);
 419         }
 420
 421         devs[i].id   = i;
 422         devs[i].prop = prop;
 423         devs[i].stat = checkResult;
 424
 425         if (checkResult == egpuCompatible)
 426         {
 427             gpu_info->n_dev_compatible++;
 428         }
 429         else
 430         {
 431             // TODO:
 432             //  - we inspect the CUDA API state to retrieve and record any
 433             //    errors that occurred during is_gmx_supported_gpu_id() here,
 434             //    but this would be more elegant done within is_gmx_supported_gpu_id()
 435             //    and only return a string with the error if one was encountered.
 436             //  - we'll be reporting without rank information which is not ideal.
 437             //  - we'll end up warning also in cases where users would already
 438             //    get an error before mdrun aborts.
 439             //
 440             // Here we also clear the CUDA API error state so potential
 441             // errors during sanity checks don't propagate.
 442             if ((stat = cudaGetLastError()) != cudaSuccess)
 443             {
 444                 gmx_warning("An error occurred while sanity checking device #%d; %s: %s",
 445                             devs[i].id, cudaGetErrorName(stat), cudaGetErrorString(stat));
 446             }
 447         }
 448     }
 449
 450     stat = cudaPeekAtLastError();
 451     GMX_RELEASE_ASSERT(stat == cudaSuccess,
 452                        gmx::formatString("We promise to return with clean CUDA state, but non-success state encountered: %s: %s",
 453                                          cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str());
 454
 455     gpu_info->n_dev   = ndev;
 456     gpu_info->gpu_dev = devs;
 457 }
 458
 459 void get_gpu_device_info_string(char *s, const gmx_gpu_info_t &gpu_info, int index)
 460 {
 461     assert(s);
 462
 463     if (index < 0 && index >= gpu_info.n_dev)
 464     {
 465         return;
 466     }
 467
 468     gmx_device_info_t *dinfo = &gpu_info.gpu_dev[index];
 469
 470     bool               bGpuExists = (dinfo->stat != egpuNonexistent &&
 471                                      dinfo->stat != egpuInsane);
 472
 473     if (!bGpuExists)
 474     {
 475         sprintf(s, "#%d: %s, stat: %s",
 476                 dinfo->id, "N/A",
 477                 gpu_detect_res_str[dinfo->stat]);
 478     }
 479     else
 480     {
 481         sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
 482                 dinfo->id, dinfo->prop.name,
 483                 dinfo->prop.major, dinfo->prop.minor,
 484                 dinfo->prop.ECCEnabled ? "yes" : " no",
 485                 gpu_detect_res_str[dinfo->stat]);
 486     }
 487 }
 488
 489 int get_current_cuda_gpu_device_id(void)
 490 {
 491     int gpuid;
 492     CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 493
 494     return gpuid;
 495 }
 496
 497 size_t sizeof_gpu_dev_info(void)
 498 {
 499     return sizeof(gmx_device_info_t);
 500 }
 501
 502 void startGpuProfiler(void)
 503 {
 504     /* The NVPROF_ID environment variable is set by nvprof and indicates that
 505        mdrun is executed in the CUDA profiler.
 506        If nvprof was run is with "--profile-from-start off", the profiler will
 507        be started here. This way we can avoid tracing the CUDA events from the
 508        first part of the run. Starting the profiler again does nothing.
 509      */
 510     if (cudaProfilerRun)
 511     {
 512         cudaError_t stat;
 513         stat = cudaProfilerStart();
 514         CU_RET_ERR(stat, "cudaProfilerStart failed");
 515     }
 516 }
 517
 518 void stopGpuProfiler(void)
 519 {
 520     /* Stopping the nvidia here allows us to eliminate the subsequent
 521        API calls from the trace, e.g. uninitialization and cleanup. */
 522     if (cudaProfilerRun)
 523     {
 524         cudaError_t stat;
 525         stat = cudaProfilerStop();
 526         CU_RET_ERR(stat, "cudaProfilerStop failed");
 527     }
 528 }
 529
 530 void resetGpuProfiler(void)
 531 {
 532     /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
 533      *  the profiling here (can't stop it) which will achieve the desired effect if
 534      *  the run was started with the profiling disabled.
 535      *
 536      * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.
 537      * stopGpuProfiler();
 538      */
 539     if (cudaProfilerRun)
 540     {
 541         startGpuProfiler();
 542     }
 543 }
 544
 545 int gpu_info_get_stat(const gmx_gpu_info_t &info, int index)
 546 {
 547     return info.gpu_dev[index].stat;
 548 }
 549
 550 /*! \brief Check status returned from peer access CUDA call, and error out or warn appropriately
 551  * \param[in] stat           CUDA call return status
 552  * \param[in] gpuA           ID for GPU initiating peer access call
 553  * \param[in] gpuB           ID for remote GPU
 554  * \param[in] mdlog          Logger object
 555  * \param[in] cudaCallName   name of CUDA peer access call
 556  */
 557 static void peerAccessCheckStat(const cudaError_t stat, const int gpuA, const int gpuB, const gmx::MDLogger &mdlog, const char *cudaCallName)
 558 {
 559     if ((stat == cudaErrorInvalidDevice) || (stat == cudaErrorInvalidValue))
 560     {
 561         std::string errorString = gmx::formatString("%s from GPU %d to GPU %d failed", cudaCallName, gpuA, gpuB);
 562         CU_RET_ERR(stat, errorString.c_str());
 563     }
 564     if (stat != cudaSuccess)
 565     {
 566         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted("GPU peer access not enabled between GPUs %d and %d due to unexpected return value from %s: %s",
 567                                                                  gpuA, gpuB, cudaCallName, cudaGetErrorString(stat));
 568     }
 569 }
 570
 571 void setupGpuDevicePeerAccess(const std::vector<int> &gpuIdsToUse, const gmx::MDLogger &mdlog)
 572 {
 573     cudaError_t stat;
 574
 575     // take a note of currently-set GPU
 576     int currentGpu;
 577     stat = cudaGetDevice(&currentGpu);
 578     CU_RET_ERR(stat, "cudaGetDevice in setupGpuDevicePeerAccess failed");
 579
 580     std::string message           = gmx::formatString("Note: Peer access enabled between the following GPU pairs in the node:\n ");
 581     bool        peerAccessEnabled = false;
 582
 583     for (unsigned int i = 0; i < gpuIdsToUse.size(); i++)
 584     {
 585         int gpuA = gpuIdsToUse[i];
 586         stat = cudaSetDevice(gpuA);
 587         if (stat != cudaSuccess)
 588         {
 589             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted("GPU peer access not enabled due to unexpected return value from cudaSetDevice(%d): %s", gpuA, cudaGetErrorString(stat));
 590             return;
 591         }
 592         for (unsigned int j = 0; j < gpuIdsToUse.size(); j++)
 593         {
 594             if (j != i)
 595             {
 596                 int gpuB          = gpuIdsToUse[j];
 597                 int canAccessPeer = 0;
 598                 stat = cudaDeviceCanAccessPeer(&canAccessPeer, gpuA, gpuB);
 599                 peerAccessCheckStat(stat, gpuA, gpuB, mdlog, "cudaDeviceCanAccessPeer");
 600
 601                 if (canAccessPeer)
 602                 {
 603                     stat = cudaDeviceEnablePeerAccess(gpuB, 0);
 604                     peerAccessCheckStat(stat, gpuA, gpuB, mdlog, "cudaDeviceEnablePeerAccess");
 605
 606                     message           = gmx::formatString("%s%d->%d ", message.c_str(), gpuA, gpuB);
 607                     peerAccessEnabled = true;
 608                 }
 609             }
 610         }
 611     }
 612
 613     //re-set GPU to that originally set
 614     stat = cudaSetDevice(currentGpu);
 615     if (stat != cudaSuccess)
 616     {
 617         CU_RET_ERR(stat, "cudaSetDevice in setupGpuDevicePeerAccess failed");
 618         return;
 619     }
 620
 621     if (peerAccessEnabled)
 622     {
 623         GMX_LOG(mdlog.info).asParagraph().appendTextFormatted("%s", message.c_str());
 624     }
 625 }