src/gromacs/hardware/device_management.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2017 The GROMACS development team.
   5  * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36 /*! \internal \file
  37  *  \brief Defines the CUDA implementations of the device management.
  38  *
  39  *  \author Artem Zhmurov <zhmurov@gmail.com>
  40  *
  41  * \ingroup module_hardware
  42  */
  43 #include "gmxpre.h"
  44
  45 #include "device_management.h"
  46
  47 #include <assert.h>
  48
  49 #include "gromacs/gpu_utils/cudautils.cuh"
  50 #include "gromacs/gpu_utils/device_context.h"
  51 #include "gromacs/gpu_utils/device_stream.h"
  52 #include "gromacs/utility/programcontext.h"
  53 #include "gromacs/utility/smalloc.h"
  54
  55 /*! \internal \brief
  56  * Max number of devices supported by CUDA (for consistency checking).
  57  *
  58  * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
  59  */
  60 static int cuda_max_device_count = 32;
  61
  62 /** Dummy kernel used for sanity checking. */
  63 static __global__ void k_dummy_test(void) {}
  64
  65 static cudaError_t checkCompiledTargetCompatibility(int deviceId, const cudaDeviceProp& deviceProp)
  66 {
  67     cudaFuncAttributes attributes;
  68     cudaError_t        stat = cudaFuncGetAttributes(&attributes, k_dummy_test);
  69
  70     if (cudaErrorInvalidDeviceFunction == stat)
  71     {
  72         fprintf(stderr,
  73                 "\nWARNING: The %s binary does not include support for the CUDA architecture of "
  74                 "the GPU ID #%d (compute capability %d.%d) detected during detection. "
  75                 "By default, GROMACS supports all architectures of compute "
  76                 "capability >= 3.0, so your GPU "
  77                 "might be rare, or some architectures were disabled in the build. \n"
  78                 "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
  79                 "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. \n",
  80                 gmx::getProgramContext().displayName(), deviceId, deviceProp.major, deviceProp.minor);
  81     }
  82
  83     return stat;
  84 }
  85
  86 /*!
  87  * \brief Runs GPU sanity checks.
  88  *
  89  * Runs a series of checks to determine that the given GPU and underlying CUDA
  90  * driver/runtime functions properly.
  91  *
  92  * \todo Currently we do not make a distinction between the type of errors
  93  *       that can appear during functionality checks. This needs to be improved,
  94  *       e.g if the dummy test kernel fails to execute with a "device busy message"
  95  *       we should appropriately report that the device is busy instead of NonFunctional.
  96  *
  97  * \todo Introduce errors codes and handle errors more smoothly.
  98  *
  99  *
 100  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
 101  * \param[in]  dev_prop    The device properties structure
 102  * \returns                0 if the device looks OK, -1 if it sanity checks failed, and -2 if the device is busy
 103  */
 104 static DeviceStatus isDeviceFunctional(int dev_id, const cudaDeviceProp& dev_prop)
 105 {
 106     cudaError_t cu_err;
 107     int         dev_count, id;
 108
 109     cu_err = cudaGetDeviceCount(&dev_count);
 110     if (cu_err != cudaSuccess)
 111     {
 112         fprintf(stderr, "Error %d while querying device count: %s\n", cu_err, cudaGetErrorString(cu_err));
 113         return DeviceStatus::NonFunctional;
 114     }
 115
 116     /* no CUDA compatible device at all */
 117     if (dev_count == 0)
 118     {
 119         return DeviceStatus::NonFunctional;
 120     }
 121
 122     /* things might go horribly wrong if cudart is not compatible with the driver */
 123     if (dev_count < 0 || dev_count > cuda_max_device_count)
 124     {
 125         return DeviceStatus::NonFunctional;
 126     }
 127
 128     if (dev_id == -1) /* device already selected let's not destroy the context */
 129     {
 130         cu_err = cudaGetDevice(&id);
 131         if (cu_err != cudaSuccess)
 132         {
 133             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err, cudaGetErrorString(cu_err));
 134             return DeviceStatus::NonFunctional;
 135         }
 136     }
 137     else
 138     {
 139         id = dev_id;
 140         if (id > dev_count - 1) /* pfff there's no such device */
 141         {
 142             fprintf(stderr,
 143                     "The requested device with id %d does not seem to exist (device count=%d)\n",
 144                     dev_id, dev_count);
 145             return DeviceStatus::NonFunctional;
 146         }
 147     }
 148
 149     /* both major & minor is 9999 if no CUDA capable devices are present */
 150     if (dev_prop.major == 9999 && dev_prop.minor == 9999)
 151     {
 152         return DeviceStatus::NonFunctional;
 153     }
 154     /* we don't care about emulation mode */
 155     if (dev_prop.major == 0)
 156     {
 157         return DeviceStatus::NonFunctional;
 158     }
 159
 160     if (id != -1)
 161     {
 162         cu_err = cudaSetDevice(id);
 163         if (cu_err != cudaSuccess)
 164         {
 165             fprintf(stderr, "Error %d while switching to device #%d: %s\n", cu_err, id,
 166                     cudaGetErrorString(cu_err));
 167             return DeviceStatus::NonFunctional;
 168         }
 169     }
 170
 171     cu_err = checkCompiledTargetCompatibility(dev_id, dev_prop);
 172     // Avoid triggering an error if GPU devices are in exclusive or prohibited mode;
 173     // it is enough to check for cudaErrorDevicesUnavailable only here because
 174     // if we encounter it that will happen in cudaFuncGetAttributes in the above function.
 175     if (cu_err == cudaErrorDevicesUnavailable)
 176     {
 177         return DeviceStatus::Unavailable;
 178     }
 179     else if (cu_err != cudaSuccess)
 180     {
 181         return DeviceStatus::NonFunctional;
 182     }
 183
 184     /* try to execute a dummy kernel */
 185     try
 186     {
 187         KernelLaunchConfig config;
 188         config.blockSize[0]                = 512;
 189         const auto          dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
 190         DeviceInformation   deviceInfo;
 191         const DeviceContext deviceContext(deviceInfo);
 192         const DeviceStream  deviceStream(deviceContext, DeviceStreamPriority::Normal, false);
 193         launchGpuKernel(k_dummy_test, config, deviceStream, nullptr, "Dummy kernel", dummyArguments);
 194     }
 195     catch (gmx::GromacsException& ex)
 196     {
 197         // launchGpuKernel error is not fatal and should continue with marking the device bad
 198         fprintf(stderr,
 199                 "Error occurred while running dummy kernel sanity check on device #%d:\n %s\n", id,
 200                 formatExceptionMessageToString(ex).c_str());
 201         return DeviceStatus::NonFunctional;
 202     }
 203
 204     if (cudaDeviceSynchronize() != cudaSuccess)
 205     {
 206         return DeviceStatus::NonFunctional;
 207     }
 208
 209     /* destroy context if we created one */
 210     if (id != -1)
 211     {
 212         cu_err = cudaDeviceReset();
 213         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 214     }
 215
 216     return DeviceStatus::Compatible;
 217 }
 218
 219 /*! \brief Returns true if the gpu characterized by the device properties is
 220  *  supported by the native gpu acceleration.
 221  *
 222  * \param[in] dev_prop  the CUDA device properties of the gpus to test.
 223  * \returns             true if the GPU properties passed indicate a compatible
 224  *                      GPU, otherwise false.
 225  */
 226 static bool is_gmx_supported_gpu(const cudaDeviceProp& dev_prop)
 227 {
 228     return (dev_prop.major >= 3);
 229 }
 230
 231 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 232  *
 233  *  Returns a status value which indicates compatibility or one of the following
 234  *  errors: incompatibility or insanity (=unexpected behavior).
 235  *
 236  *  As the error handling only permits returning the state of the GPU, this function
 237  *  does not clear the CUDA runtime API status allowing the caller to inspect the error
 238  *  upon return. Note that this also means it is the caller's responsibility to
 239  *  reset the CUDA runtime state.
 240  *
 241  *  \param[in]  deviceId   the ID of the GPU to check.
 242  *  \param[in]  deviceProp the CUDA device properties of the device checked.
 243  *  \returns               the status of the requested device
 244  */
 245 static DeviceStatus checkDeviceStatus(int deviceId, const cudaDeviceProp& deviceProp)
 246 {
 247     if (!is_gmx_supported_gpu(deviceProp))
 248     {
 249         return DeviceStatus::Incompatible;
 250     }
 251     return isDeviceFunctional(deviceId, deviceProp);
 252 }
 253
 254 bool isGpuDetectionFunctional(std::string* errorMessage)
 255 {
 256     cudaError_t stat;
 257     int         driverVersion = -1;
 258     stat                      = cudaDriverGetVersion(&driverVersion);
 259     GMX_ASSERT(stat != cudaErrorInvalidValue,
 260                "An impossible null pointer was passed to cudaDriverGetVersion");
 261     GMX_RELEASE_ASSERT(
 262             stat == cudaSuccess,
 263             gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
 264                               cudaGetErrorName(stat), cudaGetErrorString(stat))
 265                     .c_str());
 266     bool foundDriver = (driverVersion > 0);
 267     if (!foundDriver)
 268     {
 269         // Can't detect GPUs if there is no driver
 270         if (errorMessage != nullptr)
 271         {
 272             errorMessage->assign("No valid CUDA driver found");
 273         }
 274         return false;
 275     }
 276
 277     int numDevices;
 278     stat = cudaGetDeviceCount(&numDevices);
 279     if (stat != cudaSuccess)
 280     {
 281         if (errorMessage != nullptr)
 282         {
 283             /* cudaGetDeviceCount failed which means that there is
 284              * something wrong with the machine: driver-runtime
 285              * mismatch, all GPUs being busy in exclusive mode,
 286              * invalid CUDA_VISIBLE_DEVICES, or some other condition
 287              * which should result in GROMACS issuing at least a
 288              * warning. */
 289             errorMessage->assign(cudaGetErrorString(stat));
 290         }
 291
 292         // Consume the error now that we have prepared to handle
 293         // it. This stops it reappearing next time we check for
 294         // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
 295         // valid devices, then cudaGetLastError returns the
 296         // (undocumented) cudaErrorNoDevice, but this should not be a
 297         // problem as there should be no future CUDA API calls.
 298         // NVIDIA bug report #2038718 has been filed.
 299         cudaGetLastError();
 300         // Can't detect GPUs
 301         return false;
 302     }
 303
 304     // We don't actually use numDevices here, that's not the job of
 305     // this function.
 306     return true;
 307 }
 308
 309 void findGpus(gmx_gpu_info_t* gpu_info)
 310 {
 311     assert(gpu_info);
 312
 313     gpu_info->n_dev_compatible = 0;
 314
 315     int         ndev;
 316     cudaError_t stat = cudaGetDeviceCount(&ndev);
 317     if (stat != cudaSuccess)
 318     {
 319         GMX_THROW(gmx::InternalError(
 320                 "Invalid call of findGpus() when CUDA API returned an error, perhaps "
 321                 "canDetectGpus() was not called appropriately beforehand."));
 322     }
 323
 324     // We expect to start device support/sanity checks with a clean runtime error state
 325     gmx::ensureNoPendingCudaError("");
 326
 327     DeviceInformation* devs;
 328     snew(devs, ndev);
 329     for (int i = 0; i < ndev; i++)
 330     {
 331         cudaDeviceProp prop;
 332         memset(&prop, 0, sizeof(cudaDeviceProp));
 333         stat = cudaGetDeviceProperties(&prop, i);
 334         const DeviceStatus checkResult =
 335                 (stat != cudaSuccess) ? DeviceStatus::NonFunctional : checkDeviceStatus(i, prop);
 336
 337         devs[i].id   = i;
 338         devs[i].prop = prop;
 339         devs[i].stat = checkResult;
 340
 341         if (checkResult == DeviceStatus::Compatible)
 342         {
 343             gpu_info->n_dev_compatible++;
 344         }
 345         else
 346         {
 347             // TODO:
 348             //  - we inspect the CUDA API state to retrieve and record any
 349             //    errors that occurred during is_gmx_supported_gpu_id() here,
 350             //    but this would be more elegant done within is_gmx_supported_gpu_id()
 351             //    and only return a string with the error if one was encountered.
 352             //  - we'll be reporting without rank information which is not ideal.
 353             //  - we'll end up warning also in cases where users would already
 354             //    get an error before mdrun aborts.
 355             //
 356             // Here we also clear the CUDA API error state so potential
 357             // errors during sanity checks don't propagate.
 358             if ((stat = cudaGetLastError()) != cudaSuccess)
 359             {
 360                 gmx_warning("An error occurred while sanity checking device #%d; %s: %s",
 361                             devs[i].id, cudaGetErrorName(stat), cudaGetErrorString(stat));
 362             }
 363         }
 364     }
 365
 366     stat = cudaPeekAtLastError();
 367     GMX_RELEASE_ASSERT(stat == cudaSuccess,
 368                        gmx::formatString("We promise to return with clean CUDA state, but "
 369                                          "non-success state encountered: %s: %s",
 370                                          cudaGetErrorName(stat), cudaGetErrorString(stat))
 371                                .c_str());
 372
 373     gpu_info->n_dev      = ndev;
 374     gpu_info->deviceInfo = devs;
 375 }
 376
 377 void init_gpu(const DeviceInformation* deviceInfo)
 378 {
 379     cudaError_t stat;
 380
 381     assert(deviceInfo);
 382
 383     stat = cudaSetDevice(deviceInfo->id);
 384     if (stat != cudaSuccess)
 385     {
 386         auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id);
 387         CU_RET_ERR(stat, message.c_str());
 388     }
 389
 390     if (debug)
 391     {
 392         fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceInfo->id, deviceInfo->prop.name);
 393     }
 394 }
 395
 396 void free_gpu(const DeviceInformation* deviceInfo)
 397 {
 398     // One should only attempt to clear the device context when
 399     // it has been used, but currently the only way to know that a GPU
 400     // device was used is that deviceInfo will be non-null.
 401     if (deviceInfo == nullptr)
 402     {
 403         return;
 404     }
 405
 406     cudaError_t stat;
 407
 408     if (debug)
 409     {
 410         int gpuid;
 411         stat = cudaGetDevice(&gpuid);
 412         CU_RET_ERR(stat, "cudaGetDevice failed");
 413         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 414     }
 415
 416     stat = cudaDeviceReset();
 417     if (stat != cudaSuccess)
 418     {
 419         gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat));
 420     }
 421 }
 422
 423 DeviceInformation* getDeviceInfo(const gmx_gpu_info_t& gpu_info, int deviceId)
 424 {
 425     if (deviceId < 0 || deviceId >= gpu_info.n_dev)
 426     {
 427         gmx_incons("Invalid GPU deviceId requested");
 428     }
 429     return &gpu_info.deviceInfo[deviceId];
 430 }
 431
 432 void get_gpu_device_info_string(char* s, const gmx_gpu_info_t& gpu_info, int index)
 433 {
 434     assert(s);
 435
 436     if (index < 0 && index >= gpu_info.n_dev)
 437     {
 438         return;
 439     }
 440
 441     DeviceInformation* dinfo = &gpu_info.deviceInfo[index];
 442
 443     bool bGpuExists =
 444             (dinfo->stat != DeviceStatus::Nonexistent && dinfo->stat != DeviceStatus::NonFunctional);
 445
 446     if (!bGpuExists)
 447     {
 448         sprintf(s, "#%d: %s, stat: %s", dinfo->id, "N/A", c_deviceStateString[dinfo->stat]);
 449     }
 450     else
 451     {
 452         sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s", dinfo->id,
 453                 dinfo->prop.name, dinfo->prop.major, dinfo->prop.minor,
 454                 dinfo->prop.ECCEnabled ? "yes" : " no", c_deviceStateString[dinfo->stat]);
 455     }
 456 }
 457
 458 size_t sizeof_gpu_dev_info(void)
 459 {
 460     return sizeof(DeviceInformation);
 461 }
 462
 463 DeviceStatus gpu_info_get_stat(const gmx_gpu_info_t& info, int index)
 464 {
 465     return info.deviceInfo[index].stat;
 466 }