src/gromacs/hardware/device_management.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team.
   5  * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36 /*! \internal \file
  37  *  \brief Defines the CUDA implementations of the device management.
  38  *
  39  *  \author Anca Hamuraru <anca@streamcomputing.eu>
  40  *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
  41  *  \author Teemu Virolainen <teemu@streamcomputing.eu>
  42  *  \author Mark Abraham <mark.j.abraham@gmail.com>
  43  *  \author Szilárd Páll <pall.szilard@gmail.com>
  44  *  \author Artem Zhmurov <zhmurov@gmail.com>
  45  *
  46  * \ingroup module_hardware
  47  */
  48 #include "gmxpre.h"
  49
  50 #include "device_management.h"
  51
  52 #include <assert.h>
  53
  54 #include "gromacs/gpu_utils/cudautils.cuh"
  55 #include "gromacs/gpu_utils/device_context.h"
  56 #include "gromacs/gpu_utils/device_stream.h"
  57 #include "gromacs/utility/exceptions.h"
  58 #include "gromacs/utility/programcontext.h"
  59 #include "gromacs/utility/smalloc.h"
  60 #include "gromacs/utility/stringutil.h"
  61
  62 #include "device_information.h"
  63
  64 /*! \internal \brief
  65  * Max number of devices supported by CUDA (for consistency checking).
  66  *
  67  * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
  68  */
  69 static int c_cudaMaxDeviceCount = 32;
  70
  71 /** Dummy kernel used for sanity checking. */
  72 static __global__ void dummy_kernel(void) {}
  73
  74 static cudaError_t checkCompiledTargetCompatibility(int deviceId, const cudaDeviceProp& deviceProp)
  75 {
  76     cudaFuncAttributes attributes;
  77     cudaError_t        stat = cudaFuncGetAttributes(&attributes, dummy_kernel);
  78
  79     if (cudaErrorInvalidDeviceFunction == stat)
  80     {
  81         fprintf(stderr,
  82                 "\nWARNING: The %s binary does not include support for the CUDA architecture of "
  83                 "the GPU ID #%d (compute capability %d.%d) detected during detection. "
  84                 "By default, GROMACS supports all architectures of compute "
  85                 "capability >= 3.0, so your GPU "
  86                 "might be rare, or some architectures were disabled in the build. \n"
  87                 "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
  88                 "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. \n",
  89                 gmx::getProgramContext().displayName(), deviceId, deviceProp.major, deviceProp.minor);
  90     }
  91
  92     return stat;
  93 }
  94
  95 /*!
  96  * \brief Runs GPU sanity checks.
  97  *
  98  * Runs a series of checks to determine that the given GPU and underlying CUDA
  99  * driver/runtime functions properly.
 100  *
 101  * \todo Currently we do not make a distinction between the type of errors
 102  *       that can appear during functionality checks. This needs to be improved,
 103  *       e.g if the dummy test kernel fails to execute with a "device busy message"
 104  *       we should appropriately report that the device is busy instead of NonFunctional.
 105  *
 106  * \todo Introduce errors codes and handle errors more smoothly.
 107  *
 108  *
 109  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
 110  * \param[in]  dev_prop    The device properties structure
 111  * \returns                0 if the device looks OK, -1 if it sanity checks failed, and -2 if the device is busy
 112  */
 113 static DeviceStatus isDeviceFunctional(int dev_id, const cudaDeviceProp& dev_prop)
 114 {
 115     cudaError_t cu_err;
 116     int         dev_count, id;
 117
 118     cu_err = cudaGetDeviceCount(&dev_count);
 119     if (cu_err != cudaSuccess)
 120     {
 121         fprintf(stderr, "Error %d while querying device count: %s\n", cu_err, cudaGetErrorString(cu_err));
 122         return DeviceStatus::NonFunctional;
 123     }
 124
 125     /* no CUDA compatible device at all */
 126     if (dev_count == 0)
 127     {
 128         return DeviceStatus::NonFunctional;
 129     }
 130
 131     /* things might go horribly wrong if cudart is not compatible with the driver */
 132     if (dev_count < 0 || dev_count > c_cudaMaxDeviceCount)
 133     {
 134         return DeviceStatus::NonFunctional;
 135     }
 136
 137     if (dev_id == -1) /* device already selected let's not destroy the context */
 138     {
 139         cu_err = cudaGetDevice(&id);
 140         if (cu_err != cudaSuccess)
 141         {
 142             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err, cudaGetErrorString(cu_err));
 143             return DeviceStatus::NonFunctional;
 144         }
 145     }
 146     else
 147     {
 148         id = dev_id;
 149         if (id > dev_count - 1) /* pfff there's no such device */
 150         {
 151             fprintf(stderr,
 152                     "The requested device with id %d does not seem to exist (device count=%d)\n",
 153                     dev_id, dev_count);
 154             return DeviceStatus::NonFunctional;
 155         }
 156     }
 157
 158     /* both major & minor is 9999 if no CUDA capable devices are present */
 159     if (dev_prop.major == 9999 && dev_prop.minor == 9999)
 160     {
 161         return DeviceStatus::NonFunctional;
 162     }
 163     /* we don't care about emulation mode */
 164     if (dev_prop.major == 0)
 165     {
 166         return DeviceStatus::NonFunctional;
 167     }
 168
 169     if (id != -1)
 170     {
 171         cu_err = cudaSetDevice(id);
 172         if (cu_err != cudaSuccess)
 173         {
 174             fprintf(stderr, "Error %d while switching to device #%d: %s\n", cu_err, id,
 175                     cudaGetErrorString(cu_err));
 176             return DeviceStatus::NonFunctional;
 177         }
 178     }
 179
 180     cu_err = checkCompiledTargetCompatibility(dev_id, dev_prop);
 181     // Avoid triggering an error if GPU devices are in exclusive or prohibited mode;
 182     // it is enough to check for cudaErrorDevicesUnavailable only here because
 183     // if we encounter it that will happen in cudaFuncGetAttributes in the above function.
 184     if (cu_err == cudaErrorDevicesUnavailable)
 185     {
 186         return DeviceStatus::Unavailable;
 187     }
 188     else if (cu_err != cudaSuccess)
 189     {
 190         return DeviceStatus::NonFunctional;
 191     }
 192
 193     /* try to execute a dummy kernel */
 194     try
 195     {
 196         KernelLaunchConfig config;
 197         config.blockSize[0]                = 512;
 198         const auto          dummyArguments = prepareGpuKernelArguments(dummy_kernel, config);
 199         DeviceInformation   deviceInfo;
 200         const DeviceContext deviceContext(deviceInfo);
 201         const DeviceStream  deviceStream(deviceContext, DeviceStreamPriority::Normal, false);
 202         launchGpuKernel(dummy_kernel, config, deviceStream, nullptr, "Dummy kernel", dummyArguments);
 203     }
 204     catch (gmx::GromacsException& ex)
 205     {
 206         // launchGpuKernel error is not fatal and should continue with marking the device bad
 207         fprintf(stderr,
 208                 "Error occurred while running dummy kernel sanity check on device #%d:\n %s\n", id,
 209                 formatExceptionMessageToString(ex).c_str());
 210         return DeviceStatus::NonFunctional;
 211     }
 212
 213     if (cudaDeviceSynchronize() != cudaSuccess)
 214     {
 215         return DeviceStatus::NonFunctional;
 216     }
 217
 218     /* destroy context if we created one */
 219     if (id != -1)
 220     {
 221         cu_err = cudaDeviceReset();
 222         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 223     }
 224
 225     return DeviceStatus::Compatible;
 226 }
 227
 228 /*! \brief Returns true if the gpu characterized by the device properties is supported
 229  *         by the native gpu acceleration.
 230  *
 231  * \param[in] deviceProperties  The CUDA device properties of the gpus to test.
 232  * \returns                     True if the GPU properties passed indicate a compatible
 233  *                              GPU, otherwise false.
 234  */
 235 static bool isDeviceGenerationSupported(const cudaDeviceProp& deviceProperties)
 236 {
 237     return (deviceProperties.major >= 3);
 238 }
 239
 240 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 241  *
 242  *  Returns a status value which indicates compatibility or one of the following
 243  *  errors: incompatibility or insanity (=unexpected behavior).
 244  *
 245  *  As the error handling only permits returning the state of the GPU, this function
 246  *  does not clear the CUDA runtime API status allowing the caller to inspect the error
 247  *  upon return. Note that this also means it is the caller's responsibility to
 248  *  reset the CUDA runtime state.
 249  *
 250  *  \param[in]  deviceId   the ID of the GPU to check.
 251  *  \param[in]  deviceProp the CUDA device properties of the device checked.
 252  *  \returns               the status of the requested device
 253  */
 254 static DeviceStatus checkDeviceStatus(int deviceId, const cudaDeviceProp& deviceProp)
 255 {
 256     if (!isDeviceGenerationSupported(deviceProp))
 257     {
 258         return DeviceStatus::Incompatible;
 259     }
 260     return isDeviceFunctional(deviceId, deviceProp);
 261 }
 262
 263 bool isDeviceDetectionFunctional(std::string* errorMessage)
 264 {
 265     cudaError_t stat;
 266     int         driverVersion = -1;
 267     stat                      = cudaDriverGetVersion(&driverVersion);
 268     GMX_ASSERT(stat != cudaErrorInvalidValue,
 269                "An impossible null pointer was passed to cudaDriverGetVersion");
 270     GMX_RELEASE_ASSERT(
 271             stat == cudaSuccess,
 272             gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
 273                               cudaGetErrorName(stat), cudaGetErrorString(stat))
 274                     .c_str());
 275     bool foundDriver = (driverVersion > 0);
 276     if (!foundDriver)
 277     {
 278         // Can't detect GPUs if there is no driver
 279         if (errorMessage != nullptr)
 280         {
 281             errorMessage->assign("No valid CUDA driver found");
 282         }
 283         return false;
 284     }
 285
 286     int numDevices;
 287     stat = cudaGetDeviceCount(&numDevices);
 288     if (stat != cudaSuccess)
 289     {
 290         if (errorMessage != nullptr)
 291         {
 292             /* cudaGetDeviceCount failed which means that there is
 293              * something wrong with the machine: driver-runtime
 294              * mismatch, all GPUs being busy in exclusive mode,
 295              * invalid CUDA_VISIBLE_DEVICES, or some other condition
 296              * which should result in GROMACS issuing at least a
 297              * warning. */
 298             errorMessage->assign(cudaGetErrorString(stat));
 299         }
 300
 301         // Consume the error now that we have prepared to handle
 302         // it. This stops it reappearing next time we check for
 303         // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
 304         // valid devices, then cudaGetLastError returns the
 305         // (undocumented) cudaErrorNoDevice, but this should not be a
 306         // problem as there should be no future CUDA API calls.
 307         // NVIDIA bug report #2038718 has been filed.
 308         cudaGetLastError();
 309         // Can't detect GPUs
 310         return false;
 311     }
 312
 313     // We don't actually use numDevices here, that's not the job of
 314     // this function.
 315     return true;
 316 }
 317
 318 std::vector<std::unique_ptr<DeviceInformation>> findDevices()
 319 {
 320     int         numDevices;
 321     cudaError_t stat = cudaGetDeviceCount(&numDevices);
 322     if (stat != cudaSuccess)
 323     {
 324         GMX_THROW(gmx::InternalError(
 325                 "Invalid call of findDevices() when CUDA API returned an error, perhaps "
 326                 "canPerformDeviceDetection() was not called appropriately beforehand."));
 327     }
 328
 329     // We expect to start device support/sanity checks with a clean runtime error state
 330     gmx::ensureNoPendingCudaError("");
 331
 332     std::vector<std::unique_ptr<DeviceInformation>> deviceInfoList(numDevices);
 333     for (int i = 0; i < numDevices; i++)
 334     {
 335         cudaDeviceProp prop;
 336         memset(&prop, 0, sizeof(cudaDeviceProp));
 337         stat = cudaGetDeviceProperties(&prop, i);
 338         const DeviceStatus checkResult =
 339                 (stat != cudaSuccess) ? DeviceStatus::NonFunctional : checkDeviceStatus(i, prop);
 340
 341         deviceInfoList[i] = std::make_unique<DeviceInformation>();
 342
 343         deviceInfoList[i]->id     = i;
 344         deviceInfoList[i]->prop   = prop;
 345         deviceInfoList[i]->status = checkResult;
 346
 347         if (checkResult != DeviceStatus::Compatible)
 348         {
 349             // TODO:
 350             //  - we inspect the CUDA API state to retrieve and record any
 351             //    errors that occurred during is_gmx_supported_gpu_id() here,
 352             //    but this would be more elegant done within is_gmx_supported_gpu_id()
 353             //    and only return a string with the error if one was encountered.
 354             //  - we'll be reporting without rank information which is not ideal.
 355             //  - we'll end up warning also in cases where users would already
 356             //    get an error before mdrun aborts.
 357             //
 358             // Here we also clear the CUDA API error state so potential
 359             // errors during sanity checks don't propagate.
 360             if ((stat = cudaGetLastError()) != cudaSuccess)
 361             {
 362                 gmx_warning("An error occurred while sanity checking device #%d; %s: %s",
 363                             deviceInfoList[i]->id, cudaGetErrorName(stat), cudaGetErrorString(stat));
 364             }
 365         }
 366     }
 367
 368     stat = cudaPeekAtLastError();
 369     GMX_RELEASE_ASSERT(stat == cudaSuccess,
 370                        gmx::formatString("We promise to return with clean CUDA state, but "
 371                                          "non-success state encountered: %s: %s",
 372                                          cudaGetErrorName(stat), cudaGetErrorString(stat))
 373                                .c_str());
 374
 375     return deviceInfoList;
 376 }
 377
 378 void setActiveDevice(const DeviceInformation& deviceInfo)
 379 {
 380     int         deviceId = deviceInfo.id;
 381     cudaError_t stat;
 382
 383     stat = cudaSetDevice(deviceId);
 384     if (stat != cudaSuccess)
 385     {
 386         auto message = gmx::formatString("Failed to initialize GPU #%d", deviceId);
 387         CU_RET_ERR(stat, message.c_str());
 388     }
 389
 390     if (debug)
 391     {
 392         fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceId, deviceInfo.prop.name);
 393     }
 394 }
 395
 396 void releaseDevice(DeviceInformation* deviceInfo)
 397 {
 398     // device was used is that deviceInfo will be non-null.
 399     if (deviceInfo != nullptr)
 400     {
 401         cudaError_t stat;
 402
 403         int gpuid;
 404         stat = cudaGetDevice(&gpuid);
 405         if (stat == cudaSuccess)
 406         {
 407             if (debug)
 408             {
 409                 fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 410             }
 411
 412             stat = cudaDeviceReset();
 413             if (stat != cudaSuccess)
 414             {
 415                 gmx_warning("Failed to free GPU #%d: %s", gpuid, cudaGetErrorString(stat));
 416             }
 417         }
 418     }
 419 }
 420
 421 std::string getDeviceInformationString(const DeviceInformation& deviceInfo)
 422 {
 423     bool gpuExists = (deviceInfo.status != DeviceStatus::Nonexistent
 424                       && deviceInfo.status != DeviceStatus::NonFunctional);
 425
 426     if (!gpuExists)
 427     {
 428         return gmx::formatString("#%d: %s, stat: %s", deviceInfo.id, "N/A",
 429                                  c_deviceStateString[deviceInfo.status]);
 430     }
 431     else
 432     {
 433         return gmx::formatString("#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
 434                                  deviceInfo.id, deviceInfo.prop.name, deviceInfo.prop.major,
 435                                  deviceInfo.prop.minor, deviceInfo.prop.ECCEnabled ? "yes" : " no",
 436                                  c_deviceStateString[deviceInfo.status]);
 437     }
 438 }