src/gromacs/hardware/device_management.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team.
   5  * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36 /*! \internal \file
  37  *  \brief Defines the CUDA implementations of the device management.
  38  *
  39  *  \author Anca Hamuraru <anca@streamcomputing.eu>
  40  *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
  41  *  \author Teemu Virolainen <teemu@streamcomputing.eu>
  42  *  \author Mark Abraham <mark.j.abraham@gmail.com>
  43  *  \author Szilárd Páll <pall.szilard@gmail.com>
  44  *  \author Artem Zhmurov <zhmurov@gmail.com>
  45  *
  46  * \ingroup module_hardware
  47  */
  48 #include "gmxpre.h"
  49
  50 #include "device_management.h"
  51
  52 #include <assert.h>
  53
  54 #include "gromacs/gpu_utils/cudautils.cuh"
  55 #include "gromacs/gpu_utils/device_context.h"
  56 #include "gromacs/gpu_utils/device_stream.h"
  57 #include "gromacs/utility/exceptions.h"
  58 #include "gromacs/utility/programcontext.h"
  59 #include "gromacs/utility/smalloc.h"
  60 #include "gromacs/utility/stringutil.h"
  61
  62 #include "device_information.h"
  63
  64 /*! \internal \brief
  65  * Max number of devices supported by CUDA (for consistency checking).
  66  *
  67  * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
  68  */
  69 static const int c_cudaMaxDeviceCount = 32;
  70
  71 /** Dummy kernel used for sanity checking. */
  72 static __global__ void dummy_kernel() {}
  73
  74 static cudaError_t checkCompiledTargetCompatibility(int deviceId, const cudaDeviceProp& deviceProp)
  75 {
  76     cudaFuncAttributes attributes;
  77     cudaError_t        stat = cudaFuncGetAttributes(&attributes, dummy_kernel);
  78
  79     if (cudaErrorInvalidDeviceFunction == stat)
  80     {
  81         fprintf(stderr,
  82                 "\nWARNING: The %s binary does not include support for the CUDA architecture of "
  83                 "the GPU ID #%d (compute capability %d.%d) detected during detection. "
  84                 "By default, GROMACS supports all architectures of compute "
  85                 "capability >= 3.5, so your GPU "
  86                 "might be rare, or some architectures were disabled in the build. \n"
  87                 "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
  88                 "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. \n",
  89                 gmx::getProgramContext().displayName(),
  90                 deviceId,
  91                 deviceProp.major,
  92                 deviceProp.minor);
  93     }
  94
  95     return stat;
  96 }
  97
  98 /*!
  99  * \brief Runs GPU sanity checks.
 100  *
 101  * Runs a series of checks to determine that the given GPU and underlying CUDA
 102  * driver/runtime functions properly.
 103  *
 104  * \todo Currently we do not make a distinction between the type of errors
 105  *       that can appear during functionality checks. This needs to be improved,
 106  *       e.g if the dummy test kernel fails to execute with a "device busy message"
 107  *       we should appropriately report that the device is busy instead of NonFunctional.
 108  *
 109  * \todo Introduce errors codes and handle errors more smoothly.
 110  *
 111  *
 112  * \param[in]  deviceInfo  Device information on the device to check.
 113  * \returns                The status enumeration value for the checked device:
 114  */
 115 static DeviceStatus isDeviceFunctional(const DeviceInformation& deviceInfo)
 116 {
 117     cudaError_t cu_err;
 118
 119     /* both major & minor is 9999 if no CUDA capable devices are present */
 120     if (deviceInfo.prop.major == 9999 && deviceInfo.prop.minor == 9999)
 121     {
 122         return DeviceStatus::NonFunctional;
 123     }
 124     /* we don't care about emulation mode */
 125     if (deviceInfo.prop.major == 0)
 126     {
 127         return DeviceStatus::NonFunctional;
 128     }
 129
 130     cu_err = cudaSetDevice(deviceInfo.id);
 131     if (cu_err != cudaSuccess)
 132     {
 133         fprintf(stderr,
 134                 "Error while switching to device #%d. %s\n",
 135                 deviceInfo.id,
 136                 gmx::getDeviceErrorString(cu_err).c_str());
 137         return DeviceStatus::NonFunctional;
 138     }
 139
 140     cu_err = checkCompiledTargetCompatibility(deviceInfo.id, deviceInfo.prop);
 141     // Avoid triggering an error if GPU devices are in exclusive or prohibited mode;
 142     // it is enough to check for cudaErrorDevicesUnavailable only here because
 143     // if we encounter it that will happen in cudaFuncGetAttributes in the above function.
 144     if (cu_err == cudaErrorDevicesUnavailable)
 145     {
 146         return DeviceStatus::Unavailable;
 147     }
 148     else if (cu_err != cudaSuccess)
 149     {
 150         return DeviceStatus::NonFunctional;
 151     }
 152
 153     /* try to execute a dummy kernel */
 154     try
 155     {
 156         KernelLaunchConfig config;
 157         config.blockSize[0]                = 512;
 158         const auto          dummyArguments = prepareGpuKernelArguments(dummy_kernel, config);
 159         const DeviceContext deviceContext(deviceInfo);
 160         const DeviceStream  deviceStream(deviceContext, DeviceStreamPriority::Normal, false);
 161         launchGpuKernel(dummy_kernel, config, deviceStream, nullptr, "Dummy kernel", dummyArguments);
 162     }
 163     catch (gmx::GromacsException& ex)
 164     {
 165         // launchGpuKernel error is not fatal and should continue with marking the device bad
 166         fprintf(stderr,
 167                 "Error occurred while running dummy kernel sanity check on device #%d:\n %s\n",
 168                 deviceInfo.id,
 169                 formatExceptionMessageToString(ex).c_str());
 170         return DeviceStatus::NonFunctional;
 171     }
 172
 173     if (cudaDeviceSynchronize() != cudaSuccess)
 174     {
 175         return DeviceStatus::NonFunctional;
 176     }
 177
 178     cu_err = cudaDeviceReset();
 179     CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 180
 181     return DeviceStatus::Compatible;
 182 }
 183
 184 /*! \brief Returns true if the gpu characterized by the device properties is supported
 185  *         by the native gpu acceleration.
 186  *
 187  * \param[in] deviceProperties  The CUDA device properties of the gpus to test.
 188  * \returns                     True if the GPU properties passed indicate a compatible
 189  *                              GPU, otherwise false.
 190  */
 191 static bool isDeviceGenerationSupported(const cudaDeviceProp& deviceProperties)
 192 {
 193     return (deviceProperties.major >= 3);
 194 }
 195
 196 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 197  *
 198  *  Returns a status value which indicates compatibility or one of the following
 199  *  errors: incompatibility or insanity (=unexpected behavior).
 200  *
 201  *  As the error handling only permits returning the state of the GPU, this function
 202  *  does not clear the CUDA runtime API status allowing the caller to inspect the error
 203  *  upon return. Note that this also means it is the caller's responsibility to
 204  *  reset the CUDA runtime state.
 205  *
 206  *  \param[in]  deviceInfo The device information on the device to check.
 207  *  \returns               the status of the requested device
 208  */
 209 static DeviceStatus checkDeviceStatus(const DeviceInformation& deviceInfo)
 210 {
 211     if (!isDeviceGenerationSupported(deviceInfo.prop))
 212     {
 213         return DeviceStatus::Incompatible;
 214     }
 215     return isDeviceFunctional(deviceInfo);
 216 }
 217
 218 bool isDeviceDetectionFunctional(std::string* errorMessage)
 219 {
 220     cudaError_t stat;
 221     int         driverVersion = -1;
 222     stat                      = cudaDriverGetVersion(&driverVersion);
 223     GMX_ASSERT(stat != cudaErrorInvalidValue,
 224                "An impossible null pointer was passed to cudaDriverGetVersion");
 225     GMX_RELEASE_ASSERT(stat == cudaSuccess,
 226                        ("An unexpected value was returned from cudaDriverGetVersion. "
 227                         + gmx::getDeviceErrorString(stat))
 228                                .c_str());
 229     bool foundDriver = (driverVersion > 0);
 230     if (!foundDriver)
 231     {
 232         // Can't detect GPUs if there is no driver
 233         if (errorMessage != nullptr)
 234         {
 235             errorMessage->assign("No valid CUDA driver found");
 236         }
 237         return false;
 238     }
 239
 240     int numDevices;
 241     stat = cudaGetDeviceCount(&numDevices);
 242     if (stat != cudaSuccess)
 243     {
 244         if (errorMessage != nullptr)
 245         {
 246             /* cudaGetDeviceCount failed which means that there is
 247              * something wrong with the machine: driver-runtime
 248              * mismatch, all GPUs being busy in exclusive mode,
 249              * invalid CUDA_VISIBLE_DEVICES, or some other condition
 250              * which should result in GROMACS issuing at least a
 251              * warning. */
 252             errorMessage->assign(cudaGetErrorString(stat));
 253         }
 254
 255         // Consume the error now that we have prepared to handle
 256         // it. This stops it reappearing next time we check for
 257         // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
 258         // valid devices, then cudaGetLastError returns the
 259         // (undocumented) cudaErrorNoDevice, but this should not be a
 260         // problem as there should be no future CUDA API calls.
 261         // NVIDIA bug report #2038718 has been filed.
 262         cudaGetLastError();
 263         // Can't detect GPUs
 264         return false;
 265     }
 266
 267     // We don't actually use numDevices here, that's not the job of
 268     // this function.
 269     return true;
 270 }
 271
 272 std::vector<std::unique_ptr<DeviceInformation>> findDevices()
 273 {
 274     int         numDevices;
 275     cudaError_t stat = cudaGetDeviceCount(&numDevices);
 276     gmx::checkDeviceError(stat,
 277                           "Invalid call of findDevices() when CUDA API returned an error, perhaps "
 278                           "canPerformDeviceDetection() was not called appropriately beforehand.");
 279
 280     /* things might go horribly wrong if cudart is not compatible with the driver */
 281     numDevices = std::min(numDevices, c_cudaMaxDeviceCount);
 282
 283     // We expect to start device support/sanity checks with a clean runtime error state
 284     gmx::ensureNoPendingDeviceError("Trying to find available CUDA devices.");
 285
 286     std::vector<std::unique_ptr<DeviceInformation>> deviceInfoList(numDevices);
 287     for (int i = 0; i < numDevices; i++)
 288     {
 289         cudaDeviceProp prop;
 290         memset(&prop, 0, sizeof(cudaDeviceProp));
 291         stat = cudaGetDeviceProperties(&prop, i);
 292
 293         deviceInfoList[i]               = std::make_unique<DeviceInformation>();
 294         deviceInfoList[i]->id           = i;
 295         deviceInfoList[i]->prop         = prop;
 296         deviceInfoList[i]->deviceVendor = DeviceVendor::Nvidia;
 297
 298         const DeviceStatus checkResult = (stat != cudaSuccess) ? DeviceStatus::NonFunctional
 299                                                                : checkDeviceStatus(*deviceInfoList[i]);
 300
 301         deviceInfoList[i]->status = checkResult;
 302
 303         if (checkResult != DeviceStatus::Compatible)
 304         {
 305             // TODO:
 306             //  - we inspect the CUDA API state to retrieve and record any
 307             //    errors that occurred during is_gmx_supported_gpu_id() here,
 308             //    but this would be more elegant done within is_gmx_supported_gpu_id()
 309             //    and only return a string with the error if one was encountered.
 310             //  - we'll be reporting without rank information which is not ideal.
 311             //  - we'll end up warning also in cases where users would already
 312             //    get an error before mdrun aborts.
 313             //
 314             // Here we also clear the CUDA API error state so potential
 315             // errors during sanity checks don't propagate.
 316             const std::string errorMessage = gmx::formatString(
 317                     "An error occurred while sanity checking device #%d.", deviceInfoList[i]->id);
 318             gmx::ensureNoPendingDeviceError(errorMessage);
 319         }
 320     }
 321
 322     stat = cudaPeekAtLastError();
 323     GMX_RELEASE_ASSERT(
 324             stat == cudaSuccess,
 325             ("We promise to return with clean CUDA state, but non-success state encountered. "
 326              + gmx::getDeviceErrorString(stat))
 327                     .c_str());
 328
 329     return deviceInfoList;
 330 }
 331
 332 void setActiveDevice(const DeviceInformation& deviceInfo)
 333 {
 334     int         deviceId = deviceInfo.id;
 335     cudaError_t stat;
 336
 337     stat = cudaSetDevice(deviceId);
 338     if (stat != cudaSuccess)
 339     {
 340         auto message = gmx::formatString("Failed to initialize GPU #%d", deviceId);
 341         CU_RET_ERR(stat, message);
 342     }
 343
 344     if (debug)
 345     {
 346         fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceId, deviceInfo.prop.name);
 347     }
 348 }
 349
 350 void releaseDevice(DeviceInformation* deviceInfo)
 351 {
 352     // device was used is that deviceInfo will be non-null.
 353     if (deviceInfo != nullptr)
 354     {
 355         cudaError_t stat;
 356
 357         int gpuid;
 358         stat = cudaGetDevice(&gpuid);
 359         if (stat == cudaSuccess)
 360         {
 361             if (debug)
 362             {
 363                 fprintf(stderr, "Cleaning up context on GPU ID #%d.\n", gpuid);
 364             }
 365
 366             stat = cudaDeviceReset();
 367             if (stat != cudaSuccess)
 368             {
 369                 gmx_warning("Failed to free GPU #%d. %s", gpuid, gmx::getDeviceErrorString(stat).c_str());
 370             }
 371         }
 372     }
 373 }
 374
 375 std::string getDeviceInformationString(const DeviceInformation& deviceInfo)
 376 {
 377     bool gpuExists = (deviceInfo.status != DeviceStatus::Nonexistent
 378                       && deviceInfo.status != DeviceStatus::NonFunctional);
 379
 380     if (!gpuExists)
 381     {
 382         return gmx::formatString(
 383                 "#%d: %s, stat: %s", deviceInfo.id, "N/A", c_deviceStateString[deviceInfo.status]);
 384     }
 385     else
 386     {
 387         return gmx::formatString("#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
 388                                  deviceInfo.id,
 389                                  deviceInfo.prop.name,
 390                                  deviceInfo.prop.major,
 391                                  deviceInfo.prop.minor,
 392                                  deviceInfo.prop.ECCEnabled ? "yes" : " no",
 393                                  c_deviceStateString[deviceInfo.status]);
 394     }
 395 }