2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * \brief Define functions for detection and initialization for CUDA devices.
38 * \author Szilard Pall <pall.szilard@gmail.com>
43 #include "gpu_utils.h"
49 #include <cuda_profiler_api.h>
51 #include "gromacs/gpu_utils/cudautils.cuh"
52 #include "gromacs/gpu_utils/pmalloc_cuda.h"
53 #include "gromacs/hardware/gpu_hw_info.h"
54 #include "gromacs/utility/basedefinitions.h"
55 #include "gromacs/utility/cstringutil.h"
56 #include "gromacs/utility/exceptions.h"
57 #include "gromacs/utility/fatalerror.h"
58 #include "gromacs/utility/gmxassert.h"
59 #include "gromacs/utility/programcontext.h"
60 #include "gromacs/utility/smalloc.h"
61 #include "gromacs/utility/snprintf.h"
62 #include "gromacs/utility/stringutil.h"
/*! \internal \brief
 * Upper bound on the device count that the CUDA runtime is expected to
 * report; it is used purely as a plausibility check on the reported count.
 *
 * CUDA <= v5.0 actually supports 16 devices, the larger value leaves headroom.
 */
static int cuda_max_device_count{ 32 };

/*! \brief True when the NVPROF_ID environment variable was set at start-up. */
static bool cudaProfilerRun{ getenv("NVPROF_ID") != nullptr };
/*! \brief Empty kernel used for sanity checking.
 *
 * It performs no work; the sanity checks in this file query its function
 * attributes and launch it to verify that kernels can run on a device.
 */
static __global__ void k_dummy_test() {}
// Checks whether this binary contains device code usable on the given GPU by
// querying the function attributes of k_dummy_test.
//
// deviceId   - device ID, used only in the warning text below.
// deviceProp - device properties; major/minor are printed in the warning.
//
// NOTE(review): this extract omits several lines of the function (braces, the
// call that consumes the warning string, and the return statement). Presumably
// the status of cudaFuncGetAttributes() is returned to the caller -- confirm
// against the complete file.
78 static cudaError_t checkCompiledTargetCompatibility(int deviceId,
79 const cudaDeviceProp &deviceProp)
81 cudaFuncAttributes attributes;
// The attributes themselves are not inspected in the visible code; only the
// returned status is examined.
82 cudaError_t stat = cudaFuncGetAttributes(&attributes, k_dummy_test);
// cudaErrorInvalidDeviceFunction triggers the "architecture not included in
// the binary" warning that follows.
84 if (cudaErrorInvalidDeviceFunction == stat)
87 "\nWARNING: The %s binary does not include support for the CUDA architecture of "
88 "the GPU ID #%d (compute capability %d.%d) detected during detection. "
89 "By default, GROMACS supports all architectures of compute "
90 "capability >= 3.0, so your GPU "
91 "might be rare, or some architectures were disabled in the build. \n"
92 "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
93 "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. \n",
94 gmx::getProgramContext().displayName(), deviceId,
95 deviceProp.major, deviceProp.minor);
// Determines from the status of cudaPointerGetAttributes() whether the host
// buffer h_ptr is known to CUDA as pinned memory.
//
// NOTE(review): the switch header, the handling of the success case and the
// return statement are not visible in this extract; only the
// cudaErrorInvalidValue case and the final error check are shown.
101 bool isHostMemoryPinned(const void *h_ptr)
103 cudaPointerAttributes memoryAttributes;
104 cudaError_t stat = cudaPointerGetAttributes(&memoryAttributes, h_ptr);
// An invalid-value status is treated as "not pinned" rather than as a failure.
113 case cudaErrorInvalidValue:
114 // If the buffer was not pinned, then it will not be recognized by CUDA at all
116 // Reset the last error status
// Any status not handled by an earlier case is reported as unexpected.
121 CU_RET_ERR(stat, "Unexpected CUDA error");
127 * \brief Runs GPU sanity checks.
129 * Runs a series of checks to determine that the given GPU and underlying CUDA
130 * driver/runtime functions properly.
132 * \param[in] dev_id the device ID of the GPU or -1 if the device has already been initialized
133 * \param[in] dev_prop The device properties structure
134 * \returns 0 if the device looks OK, -1 if its sanity checks failed, and -2 if the device is busy
136 * TODO: introduce error codes and handle errors more smoothly.
// Runs the GPU sanity checks for one device. The visible steps are, in order:
//   1. query the device count and validate it against cuda_max_device_count;
//   2. resolve the device ID (the current device when dev_id == -1) and check
//      that it is below the device count;
//   3. reject property values that signal "no CUDA device" or emulation mode;
//   4. switch to the device, check that the binary contains code for it, then
//      launch the dummy kernel and synchronize;
//   5. reset the device so that a context created here is destroyed.
//
// NOTE(review): this extract omits the local declarations, most return
// statements and several guards, so the exact return value of each failure
// path cannot be confirmed from here.
138 static int do_sanity_checks(int dev_id, const cudaDeviceProp &dev_prop)
143 cu_err = cudaGetDeviceCount(&dev_count);
144 if (cu_err != cudaSuccess)
146 fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
147 cudaGetErrorString(cu_err));
151 /* no CUDA compatible device at all */
157 /* things might go horribly wrong if cudart is not compatible with the driver */
158 if (dev_count < 0 || dev_count > cuda_max_device_count)
163 if (dev_id == -1) /* device already selected let's not destroy the context */
165 cu_err = cudaGetDevice(&id);
166 if (cu_err != cudaSuccess)
168 fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
169 cudaGetErrorString(cu_err));
176 if (id > dev_count - 1) /* pfff there's no such device */
178 fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
184 /* both major & minor is 9999 if no CUDA capable devices are present */
185 if (dev_prop.major == 9999 && dev_prop.minor == 9999)
189 /* we don't care about emulation mode */
190 if (dev_prop.major == 0)
197 cu_err = cudaSetDevice(id);
198 if (cu_err != cudaSuccess)
200 fprintf(stderr, "Error %d while switching to device #%d: %s\n",
201 cu_err, id, cudaGetErrorString(cu_err));
// NOTE(review): dev_id (which may be -1) is passed here instead of the
// resolved id; the callee only prints it, so a warning could show "GPU ID #-1".
206 cu_err = checkCompiledTargetCompatibility(dev_id, dev_prop);
207 // Avoid triggering an error if GPU devices are in exclusive or prohibited mode;
208 // it is enough to check for cudaErrorDevicesUnavailable only here because
209 // if we encounter it that will happen in cudaFuncGetAttributes in the above function.
210 if (cu_err == cudaErrorDevicesUnavailable)
214 else if (cu_err != cudaSuccess)
219 /* try to execute a dummy kernel */
222 KernelLaunchConfig config;
// Only the x block size is set here; the remaining launch parameters keep
// whatever defaults KernelLaunchConfig provides.
223 config.blockSize[0] = 512;
224 const auto dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
225 launchGpuKernel(k_dummy_test, config, nullptr, "Dummy kernel", dummyArguments);
227 catch (gmx::GromacsException &ex)
229 // launchGpuKernel error is not fatal and should continue with marking the device bad
230 fprintf(stderr, "Error occurred while running dummy kernel sanity check on device #%d:\n %s\n",
231 id, formatExceptionMessageToString(ex).c_str());
// Synchronizing waits for the dummy kernel to finish, so a failure during its
// execution is seen here.
235 if (cudaDeviceSynchronize() != cudaSuccess)
240 /* destroy context if we created one */
243 cu_err = cudaDeviceReset();
244 CU_RET_ERR(cu_err, "cudaDeviceReset failed");
// Makes the GPU described by deviceInfo the current CUDA device and reports
// a failure of cudaSetDevice() through CU_RET_ERR with a message naming the
// device ID.
//
// NOTE(review): the declaration of stat and the condition guarding the final
// fprintf are not visible in this extract.
250 void init_gpu(const gmx_device_info_t *deviceInfo)
256 stat = cudaSetDevice(deviceInfo->id);
257 if (stat != cudaSuccess)
// The message is built first because it embeds the device ID.
259 auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id);
260 CU_RET_ERR(stat, message.c_str());
265 fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceInfo->id, deviceInfo->prop.name);
// Releases the CUDA context of the current device with cudaDeviceReset().
// A failed reset is reported with gmx_warning(); no fatal-error call for that
// case appears in the visible code.
//
// NOTE(review): the action taken for a null deviceInfo, the local
// declarations and the condition guarding the "Cleaning up" message are not
// visible in this extract.
269 void free_gpu(const gmx_device_info_t *deviceInfo)
271 // One should only attempt to clear the device context when
272 // it has been used, but currently the only way to know that a GPU
273 // device was used is that deviceInfo will be non-null.
274 if (deviceInfo == nullptr)
// The current device ID is queried only for the message printed below.
284 stat = cudaGetDevice(&gpuid);
285 CU_RET_ERR(stat, "cudaGetDevice failed");
286 fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
289 stat = cudaDeviceReset();
290 if (stat != cudaSuccess)
// The warning names deviceInfo->id, not the ID obtained from cudaGetDevice().
292 gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat));
// Returns a pointer to the entry of gpu_info.gpu_dev selected by deviceId.
// An ID outside [0, gpu_info.n_dev) is reported through gmx_incons().
//
// NOTE(review): the line declaring the deviceId parameter is not visible in
// this extract.
296 gmx_device_info_t *getDeviceInfo(const gmx_gpu_info_t &gpu_info,
299 if (deviceId < 0 || deviceId >= gpu_info.n_dev)
301 gmx_incons("Invalid GPU deviceId requested");
303 return &gpu_info.gpu_dev[deviceId];
/*! \brief Tells whether a GPU is recent enough for the native GPU acceleration.
 *
 * Only the major compute capability is inspected: devices with a major
 * version of 3 or higher are accepted.
 *
 * \param[in] dev_prop  the CUDA device properties of the GPU to test.
 * \returns             true if the properties indicate a compatible GPU,
 *                      otherwise false.
 */
static bool is_gmx_supported_gpu(const cudaDeviceProp &dev_prop)
{
    constexpr int c_minimumMajorComputeCapability = 3;
    const bool    isSupported = (dev_prop.major >= c_minimumMajorComputeCapability);

    return isSupported;
}
/*! \brief Classifies the GPU with the given ID for use by the native GROMACS acceleration.
 *
 * The result is egpuIncompatible when is_gmx_supported_gpu() rejects the
 * device properties. Otherwise the outcome of do_sanity_checks() is
 * translated into egpuCompatible, egpuInsane (unexpected behavior) or
 * egpuUnavailable.
 *
 * Because only the state of the GPU can be returned, the CUDA runtime API
 * error status is left untouched so that the caller can inspect it. It is
 * therefore the caller's responsibility to reset the CUDA runtime state.
 *
 * \param[in] deviceId    the ID of the GPU to check.
 * \param[in] deviceProp  the CUDA device properties of the device checked.
 * \returns               the status of the requested device
 */
static int is_gmx_supported_gpu_id(int deviceId,
                                   const cudaDeviceProp &deviceProp)
{
    const bool hasSupportedProperties = is_gmx_supported_gpu(deviceProp);
    if (!hasSupportedProperties)
    {
        return egpuIncompatible;
    }

    /* TODO: currently we do not make a distinction between the type of errors
     * that can appear during sanity checks. This needs to be improved, e.g if
     * the dummy test kernel fails to execute with a "device busy message" we
     * should appropriately report that the device is busy instead of insane.
     */
    const int sanityStatus = do_sanity_checks(deviceId, deviceProp);
    if (sanityStatus == 0)
    {
        return egpuCompatible;
    }
    if (sanityStatus == -1)
    {
        return egpuInsane;
    }
    if (sanityStatus == -2)
    {
        return egpuUnavailable;
    }

    // No other value is expected from do_sanity_checks()
    GMX_RELEASE_ASSERT(false, "Invalid do_sanity_checks() return value");
    return egpuCompatible;
}
// Checks the preconditions for GPU detection: a CUDA driver must report a
// version greater than zero, and cudaGetDeviceCount() must succeed.
//
// errorMessage - optional output; when non-null it receives a description of
//                why detection is not possible.
//
// NOTE(review): the declarations of stat and numDevices, the branch on
// foundDriver, the call that consumes the pending error and all return
// statements are not visible in this extract.
356 bool canDetectGpus(std::string *errorMessage)
359 int driverVersion = -1;
360 stat = cudaDriverGetVersion(&driverVersion);
361 GMX_ASSERT(stat != cudaErrorInvalidValue, "An impossible null pointer was passed to cudaDriverGetVersion");
362 GMX_RELEASE_ASSERT(stat == cudaSuccess,
363 gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
364 cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str());
// driverVersion starts at -1, so a value of zero or below is read as
// "no driver found".
365 bool foundDriver = (driverVersion > 0);
368 // Can't detect GPUs if there is no driver
369 if (errorMessage != nullptr)
371 errorMessage->assign("No valid CUDA driver found");
377 stat = cudaGetDeviceCount(&numDevices);
378 if (stat != cudaSuccess)
380 if (errorMessage != nullptr)
382 /* cudaGetDeviceCount failed which means that there is
383 * something wrong with the machine: driver-runtime
384 * mismatch, all GPUs being busy in exclusive mode,
385 * invalid CUDA_VISIBLE_DEVICES, or some other condition
386 * which should result in GROMACS issuing at least a
388 errorMessage->assign(cudaGetErrorString(stat));
391 // Consume the error now that we have prepared to handle
392 // it. This stops it reappearing next time we check for
393 // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
394 // valid devices, then cudaGetLastError returns the
395 // (undocumented) cudaErrorNoDevice, but this should not be a
396 // problem as there should be no future CUDA API calls.
397 // NVIDIA bug report #2038718 has been filed.
403 // We don't actually use numDevices here, that's not the job of
// Fills gpu_info with one entry per CUDA device. For every device index the
// properties are queried, a status is computed with is_gmx_supported_gpu_id()
// (or egpuInsane when the property query fails), and compatible devices are
// counted in n_dev_compatible. Any CUDA error left pending by the per-device
// checks is reported with gmx_warning() and cleared.
//
// Throws gmx::InternalError when cudaGetDeviceCount() fails.
//
// NOTE(review): the allocation of devs, the declarations of ndev, prop and
// checkResult, and the assignments of the remaining devs[i] fields are not
// visible in this extract.
408 void findGpus(gmx_gpu_info_t *gpu_info)
412 gpu_info->n_dev_compatible = 0;
415 cudaError_t stat = cudaGetDeviceCount(&ndev);
416 if (stat != cudaSuccess)
418 GMX_THROW(gmx::InternalError("Invalid call of findGpus() when CUDA API returned an error, perhaps "
419 "canDetectGpus() was not called appropriately beforehand."));
422 // We expect to start device support/sanity checks with a clean runtime error state
423 gmx::ensureNoPendingCudaError("");
425 gmx_device_info_t *devs;
427 for (int i = 0; i < ndev; i++)
// The properties struct is zeroed before the query.
430 memset(&prop, 0, sizeof(cudaDeviceProp));
431 stat = cudaGetDeviceProperties(&prop, i);
433 if (stat != cudaSuccess)
435 // Will handle the error reporting below
436 checkResult = egpuInsane;
440 checkResult = is_gmx_supported_gpu_id(i, prop);
445 devs[i].stat = checkResult;
447 if (checkResult == egpuCompatible)
449 gpu_info->n_dev_compatible++;
454 // - we inspect the CUDA API state to retrieve and record any
455 // errors that occurred during is_gmx_supported_gpu_id() here,
456 // but this would be more elegant done within is_gmx_supported_gpu_id()
457 // and only return a string with the error if one was encountered.
458 // - we'll be reporting without rank information which is not ideal.
459 // - we'll end up warning also in cases where users would already
460 // get an error before mdrun aborts.
462 // Here we also clear the CUDA API error state so potential
463 // errors during sanity checks don't propagate.
// cudaGetLastError() both returns and clears the pending error.
464 if ((stat = cudaGetLastError()) != cudaSuccess)
466 gmx_warning("An error occurred while sanity checking device #%d; %s: %s",
467 devs[i].id, cudaGetErrorName(stat), cudaGetErrorString(stat));
// cudaPeekAtLastError() reads the state without clearing it; this only
// verifies that no error is left pending.
472 stat = cudaPeekAtLastError();
473 GMX_RELEASE_ASSERT(stat == cudaSuccess,
474 gmx::formatString("We promise to return with clean CUDA state, but non-success state encountered: %s: %s",
475 cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str());
477 gpu_info->n_dev = ndev;
478 gpu_info->gpu_dev = devs;
/*! \brief Formats a one-line description of a detected GPU into a caller-provided buffer.
 *
 * Devices whose status is egpuNonexistent or egpuInsane are described by
 * their ID and status only. For all other devices the name, compute
 * capability and ECC state are printed as well.
 *
 * \param[out] s         destination buffer; must be non-null and large enough
 *                       for the formatted text (no size is passed in, so the
 *                       write cannot be bounded here).
 * \param[in]  gpu_info  GPU detection results.
 * \param[in]  index     index into gpu_info.gpu_dev; when it lies outside
 *                       [0, gpu_info.n_dev) the function returns without
 *                       writing to \p s.
 */
void get_gpu_device_info_string(char *s, const gmx_gpu_info_t &gpu_info, int index)
{
    GMX_ASSERT(s != nullptr, "Need a valid output buffer for the device information string");

    // The range test must use ||: with && the condition can never be true
    // (no index is both negative and >= n_dev), which let out-of-range
    // indices through to the array access below.
    if (index < 0 || index >= gpu_info.n_dev)
    {
        return;
    }

    const gmx_device_info_t *dinfo = &gpu_info.gpu_dev[index];

    const bool bGpuExists = (dinfo->stat != egpuNonexistent &&
                             dinfo->stat != egpuInsane);

    if (!bGpuExists)
    {
        // NOTE(review): the argument line providing the ID and the name
        // placeholder is not visible in the reviewed extract; "N/A" is
        // assumed for the placeholder -- confirm against the complete file.
        sprintf(s, "#%d: %s, stat: %s",
                dinfo->id, "N/A",
                gpu_detect_res_str[dinfo->stat]);
    }
    else
    {
        sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
                dinfo->id, dinfo->prop.name,
                dinfo->prop.major, dinfo->prop.minor,
                dinfo->prop.ECCEnabled ? "yes" : " no",
                gpu_detect_res_str[dinfo->stat]);
    }
}
/*! \brief Returns the ID of the CUDA device that is current for the calling host thread.
 *
 * The runtime call is made outside the CU_RET_ERR argument list, matching
 * every other checked call in this file. CU_RET_ERR is a macro defined
 * elsewhere, so a call expression passed as its status argument could be
 * evaluated more than once, or not at all, depending on how it expands.
 *
 * \returns the current device ID, or -1 if cudaGetDevice() did not set it and
 *          the error check did not abort.
 */
int get_current_cuda_gpu_device_id(void)
{
    // Initialized so that the return value is well-defined on every path
    int         gpuid = -1;
    cudaError_t stat  = cudaGetDevice(&gpuid);
    CU_RET_ERR(stat, "cudaGetDevice failed");

    return gpuid;
}
/*! \brief Returns the size in bytes of one gmx_device_info_t record. */
size_t sizeof_gpu_dev_info(void)
{
    constexpr size_t c_deviceInfoSize = sizeof(gmx_device_info_t);

    return c_deviceInfoSize;
}
// Selects the host allocation and deallocation routines through the two
// output pointers. The only assignment visible here stores the address of
// pmalloc in *nb_alloc.
//
// NOTE(review): the branch on bUseGpuKernels and the assignments to *nb_free
// are not visible in this extract; presumably pmalloc is chosen only when GPU
// kernels are used -- confirm against the complete file.
524 void gpu_set_host_malloc_and_free(bool bUseGpuKernels,
525 gmx_host_alloc_t **nb_alloc,
526 gmx_host_free_t **nb_free)
530 *nb_alloc = &pmalloc;
// Starts the CUDA profiler with cudaProfilerStart() and checks the returned
// status with CU_RET_ERR.
//
// NOTE(review): the declaration of stat and the condition under which the
// profiler is started are not visible in this extract.
540 void startGpuProfiler(void)
542 /* The NVPROF_ID environment variable is set by nvprof and indicates that
543 mdrun is executed in the CUDA profiler.
544 If nvprof was run with "--profile-from-start off", the profiler will
545 be started here. This way we can avoid tracing the CUDA events from the
546 first part of the run. Starting the profiler again does nothing.
551 stat = cudaProfilerStart();
552 CU_RET_ERR(stat, "cudaProfilerStart failed");
// Stops the CUDA profiler with cudaProfilerStop() and checks the returned
// status with CU_RET_ERR.
//
// NOTE(review): the declaration of stat and the condition under which the
// profiler is stopped are not visible in this extract.
556 void stopGpuProfiler(void)
558 /* Stopping the nvidia profiler here allows us to eliminate the subsequent
559 API calls from the trace, e.g. uninitialization and cleanup. */
563 stat = cudaProfilerStop();
564 CU_RET_ERR(stat, "cudaProfilerStop failed");
// Resets GPU profiling. According to the comment below, the profiler can only
// be started (not stopped) at this point.
//
// NOTE(review): none of the statements of this function are visible in this
// extract, so the calls it makes cannot be confirmed from here.
568 void resetGpuProfiler(void)
570 /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
571 * the profiling here (can't stop it) which will achieve the desired effect if
572 * the run was started with the profiling disabled.
574 * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.
/*! \brief Returns the detection status stored for one device.
 *
 * \param[in] info   GPU detection results.
 * \param[in] index  index into info.gpu_dev; it is not range-checked here, so
 *                   the caller must pass a valid index.
 * \returns          the stat field of the selected device entry.
 */
int gpu_info_get_stat(const gmx_gpu_info_t &info, int index)
{
    const gmx_device_info_t &device = info.gpu_dev[index];

    return device.stat;
}