2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2010,2011,2012,2013,2014,2015,2016, The GROMACS development team.
5 * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Define functions for detection and initialization for CUDA devices.
39 * \author Szilard Pall <pall.szilard@gmail.com>
44 #include "gpu_utils.h"
50 #include <cuda_profiler_api.h>
52 #include "gromacs/gpu_utils/cudautils.cuh"
53 #include "gromacs/gpu_utils/device_context.h"
54 #include "gromacs/gpu_utils/device_stream.h"
55 #include "gromacs/gpu_utils/pmalloc_cuda.h"
56 #include "gromacs/hardware/gpu_hw_info.h"
57 #include "gromacs/utility/basedefinitions.h"
58 #include "gromacs/utility/cstringutil.h"
59 #include "gromacs/utility/exceptions.h"
60 #include "gromacs/utility/fatalerror.h"
61 #include "gromacs/utility/gmxassert.h"
62 #include "gromacs/utility/logger.h"
63 #include "gromacs/utility/programcontext.h"
64 #include "gromacs/utility/smalloc.h"
65 #include "gromacs/utility/snprintf.h"
66 #include "gromacs/utility/stringutil.h"
69 * Max number of devices supported by CUDA (for consistency checking).
71 * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
73 static int cuda_max_device_count = 32;
75 static bool cudaProfilerRun = ((getenv("NVPROF_ID") != nullptr));
77 /** Dummy kernel used for sanity checking. */
78 static __global__ void k_dummy_test(void) {}
80 static cudaError_t checkCompiledTargetCompatibility(int deviceId, const cudaDeviceProp& deviceProp)
82 cudaFuncAttributes attributes;
83 cudaError_t stat = cudaFuncGetAttributes(&attributes, k_dummy_test);
85 if (cudaErrorInvalidDeviceFunction == stat)
88 "\nWARNING: The %s binary does not include support for the CUDA architecture of "
89 "the GPU ID #%d (compute capability %d.%d) detected during detection. "
90 "By default, GROMACS supports all architectures of compute "
91 "capability >= 3.0, so your GPU "
92 "might be rare, or some architectures were disabled in the build. \n"
93 "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
94 "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. \n",
95 gmx::getProgramContext().displayName(), deviceId, deviceProp.major, deviceProp.minor);
101 bool isHostMemoryPinned(const void* h_ptr)
103 cudaPointerAttributes memoryAttributes;
104 cudaError_t stat = cudaPointerGetAttributes(&memoryAttributes, h_ptr);
109 case cudaSuccess: result = true; break;
111 case cudaErrorInvalidValue:
112 // If the buffer was not pinned, then it will not be recognized by CUDA at all
114 // Reset the last error status
118 default: CU_RET_ERR(stat, "Unexpected CUDA error");
124 * \brief Runs GPU sanity checks.
126 * Runs a series of checks to determine that the given GPU and underlying CUDA
127 * driver/runtime functions properly.
129 * \todo Currently we do not make a distinction between the type of errors
130 * that can appear during functionality checks. This needs to be improved,
131 * e.g if the dummy test kernel fails to execute with a "device busy message"
132 * we should appropriately report that the device is busy instead of NonFunctional.
134 * \todo Introduce errors codes and handle errors more smoothly.
137 * \param[in] dev_id the device ID of the GPU or -1 if the device has already been initialized
138 * \param[in] dev_prop The device properties structure
139 * \returns 0 if the device looks OK, -1 if it sanity checks failed, and -2 if the device is busy
141 static DeviceStatus isDeviceFunctional(int dev_id, const cudaDeviceProp& dev_prop)
146 cu_err = cudaGetDeviceCount(&dev_count);
147 if (cu_err != cudaSuccess)
149 fprintf(stderr, "Error %d while querying device count: %s\n", cu_err, cudaGetErrorString(cu_err));
150 return DeviceStatus::NonFunctional;
153 /* no CUDA compatible device at all */
156 return DeviceStatus::NonFunctional;
159 /* things might go horribly wrong if cudart is not compatible with the driver */
160 if (dev_count < 0 || dev_count > cuda_max_device_count)
162 return DeviceStatus::NonFunctional;
165 if (dev_id == -1) /* device already selected let's not destroy the context */
167 cu_err = cudaGetDevice(&id);
168 if (cu_err != cudaSuccess)
170 fprintf(stderr, "Error %d while querying device id: %s\n", cu_err, cudaGetErrorString(cu_err));
171 return DeviceStatus::NonFunctional;
177 if (id > dev_count - 1) /* pfff there's no such device */
180 "The requested device with id %d does not seem to exist (device count=%d)\n",
182 return DeviceStatus::NonFunctional;
186 /* both major & minor is 9999 if no CUDA capable devices are present */
187 if (dev_prop.major == 9999 && dev_prop.minor == 9999)
189 return DeviceStatus::NonFunctional;
191 /* we don't care about emulation mode */
192 if (dev_prop.major == 0)
194 return DeviceStatus::NonFunctional;
199 cu_err = cudaSetDevice(id);
200 if (cu_err != cudaSuccess)
202 fprintf(stderr, "Error %d while switching to device #%d: %s\n", cu_err, id,
203 cudaGetErrorString(cu_err));
204 return DeviceStatus::NonFunctional;
208 cu_err = checkCompiledTargetCompatibility(dev_id, dev_prop);
209 // Avoid triggering an error if GPU devices are in exclusive or prohibited mode;
210 // it is enough to check for cudaErrorDevicesUnavailable only here because
211 // if we encounter it that will happen in cudaFuncGetAttributes in the above function.
212 if (cu_err == cudaErrorDevicesUnavailable)
214 return DeviceStatus::Unavailable;
216 else if (cu_err != cudaSuccess)
218 return DeviceStatus::NonFunctional;
221 /* try to execute a dummy kernel */
224 KernelLaunchConfig config;
225 config.blockSize[0] = 512;
226 const auto dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
227 DeviceInformation deviceInfo;
228 const DeviceContext deviceContext(deviceInfo);
229 const DeviceStream deviceStream(deviceContext, DeviceStreamPriority::Normal, false);
230 launchGpuKernel(k_dummy_test, config, deviceStream, nullptr, "Dummy kernel", dummyArguments);
232 catch (gmx::GromacsException& ex)
234 // launchGpuKernel error is not fatal and should continue with marking the device bad
236 "Error occurred while running dummy kernel sanity check on device #%d:\n %s\n", id,
237 formatExceptionMessageToString(ex).c_str());
238 return DeviceStatus::NonFunctional;
241 if (cudaDeviceSynchronize() != cudaSuccess)
243 return DeviceStatus::NonFunctional;
246 /* destroy context if we created one */
249 cu_err = cudaDeviceReset();
250 CU_RET_ERR(cu_err, "cudaDeviceReset failed");
253 return DeviceStatus::Compatible;
256 void init_gpu(const DeviceInformation* deviceInfo)
262 stat = cudaSetDevice(deviceInfo->id);
263 if (stat != cudaSuccess)
265 auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id);
266 CU_RET_ERR(stat, message.c_str());
271 fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceInfo->id, deviceInfo->prop.name);
275 void free_gpu(const DeviceInformation* deviceInfo)
277 // One should only attempt to clear the device context when
278 // it has been used, but currently the only way to know that a GPU
279 // device was used is that deviceInfo will be non-null.
280 if (deviceInfo == nullptr)
290 stat = cudaGetDevice(&gpuid);
291 CU_RET_ERR(stat, "cudaGetDevice failed");
292 fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
295 stat = cudaDeviceReset();
296 if (stat != cudaSuccess)
298 gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat));
302 DeviceInformation* getDeviceInfo(const gmx_gpu_info_t& gpu_info, int deviceId)
304 if (deviceId < 0 || deviceId >= gpu_info.n_dev)
306 gmx_incons("Invalid GPU deviceId requested");
308 return &gpu_info.deviceInfo[deviceId];
311 /*! \brief Returns true if the gpu characterized by the device properties is
312 * supported by the native gpu acceleration.
314 * \param[in] dev_prop the CUDA device properties of the gpus to test.
315 * \returns true if the GPU properties passed indicate a compatible
316 * GPU, otherwise false.
318 static bool is_gmx_supported_gpu(const cudaDeviceProp& dev_prop)
320 return (dev_prop.major >= 3);
323 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
325 * Returns a status value which indicates compatibility or one of the following
326 * errors: incompatibility or insanity (=unexpected behavior).
328 * As the error handling only permits returning the state of the GPU, this function
329 * does not clear the CUDA runtime API status allowing the caller to inspect the error
330 * upon return. Note that this also means it is the caller's responsibility to
331 * reset the CUDA runtime state.
333 * \param[in] deviceId the ID of the GPU to check.
334 * \param[in] deviceProp the CUDA device properties of the device checked.
335 * \returns the status of the requested device
337 static DeviceStatus checkDeviceStatus(int deviceId, const cudaDeviceProp& deviceProp)
339 if (!is_gmx_supported_gpu(deviceProp))
341 return DeviceStatus::Incompatible;
343 return isDeviceFunctional(deviceId, deviceProp);
346 bool isGpuDetectionFunctional(std::string* errorMessage)
349 int driverVersion = -1;
350 stat = cudaDriverGetVersion(&driverVersion);
351 GMX_ASSERT(stat != cudaErrorInvalidValue,
352 "An impossible null pointer was passed to cudaDriverGetVersion");
355 gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
356 cudaGetErrorName(stat), cudaGetErrorString(stat))
358 bool foundDriver = (driverVersion > 0);
361 // Can't detect GPUs if there is no driver
362 if (errorMessage != nullptr)
364 errorMessage->assign("No valid CUDA driver found");
370 stat = cudaGetDeviceCount(&numDevices);
371 if (stat != cudaSuccess)
373 if (errorMessage != nullptr)
375 /* cudaGetDeviceCount failed which means that there is
376 * something wrong with the machine: driver-runtime
377 * mismatch, all GPUs being busy in exclusive mode,
378 * invalid CUDA_VISIBLE_DEVICES, or some other condition
379 * which should result in GROMACS issuing at least a
381 errorMessage->assign(cudaGetErrorString(stat));
384 // Consume the error now that we have prepared to handle
385 // it. This stops it reappearing next time we check for
386 // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
387 // valid devices, then cudaGetLastError returns the
388 // (undocumented) cudaErrorNoDevice, but this should not be a
389 // problem as there should be no future CUDA API calls.
390 // NVIDIA bug report #2038718 has been filed.
396 // We don't actually use numDevices here, that's not the job of
401 void findGpus(gmx_gpu_info_t* gpu_info)
405 gpu_info->n_dev_compatible = 0;
408 cudaError_t stat = cudaGetDeviceCount(&ndev);
409 if (stat != cudaSuccess)
411 GMX_THROW(gmx::InternalError(
412 "Invalid call of findGpus() when CUDA API returned an error, perhaps "
413 "canDetectGpus() was not called appropriately beforehand."));
416 // We expect to start device support/sanity checks with a clean runtime error state
417 gmx::ensureNoPendingCudaError("");
419 DeviceInformation* devs;
421 for (int i = 0; i < ndev; i++)
424 memset(&prop, 0, sizeof(cudaDeviceProp));
425 stat = cudaGetDeviceProperties(&prop, i);
426 const DeviceStatus checkResult =
427 (stat != cudaSuccess) ? DeviceStatus::NonFunctional : checkDeviceStatus(i, prop);
431 devs[i].stat = checkResult;
433 if (checkResult == DeviceStatus::Compatible)
435 gpu_info->n_dev_compatible++;
440 // - we inspect the CUDA API state to retrieve and record any
441 // errors that occurred during is_gmx_supported_gpu_id() here,
442 // but this would be more elegant done within is_gmx_supported_gpu_id()
443 // and only return a string with the error if one was encountered.
444 // - we'll be reporting without rank information which is not ideal.
445 // - we'll end up warning also in cases where users would already
446 // get an error before mdrun aborts.
448 // Here we also clear the CUDA API error state so potential
449 // errors during sanity checks don't propagate.
450 if ((stat = cudaGetLastError()) != cudaSuccess)
452 gmx_warning("An error occurred while sanity checking device #%d; %s: %s",
453 devs[i].id, cudaGetErrorName(stat), cudaGetErrorString(stat));
458 stat = cudaPeekAtLastError();
459 GMX_RELEASE_ASSERT(stat == cudaSuccess,
460 gmx::formatString("We promise to return with clean CUDA state, but "
461 "non-success state encountered: %s: %s",
462 cudaGetErrorName(stat), cudaGetErrorString(stat))
465 gpu_info->n_dev = ndev;
466 gpu_info->deviceInfo = devs;
469 void get_gpu_device_info_string(char* s, const gmx_gpu_info_t& gpu_info, int index)
473 if (index < 0 && index >= gpu_info.n_dev)
478 DeviceInformation* dinfo = &gpu_info.deviceInfo[index];
481 (dinfo->stat != DeviceStatus::Nonexistent && dinfo->stat != DeviceStatus::NonFunctional);
485 sprintf(s, "#%d: %s, stat: %s", dinfo->id, "N/A", c_deviceStateString[dinfo->stat]);
489 sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s", dinfo->id,
490 dinfo->prop.name, dinfo->prop.major, dinfo->prop.minor,
491 dinfo->prop.ECCEnabled ? "yes" : " no", c_deviceStateString[dinfo->stat]);
495 int get_current_cuda_gpu_device_id(void)
498 CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
503 size_t sizeof_gpu_dev_info(void)
505 return sizeof(DeviceInformation);
508 void startGpuProfiler(void)
510 /* The NVPROF_ID environment variable is set by nvprof and indicates that
511 mdrun is executed in the CUDA profiler.
512 If nvprof was run is with "--profile-from-start off", the profiler will
513 be started here. This way we can avoid tracing the CUDA events from the
514 first part of the run. Starting the profiler again does nothing.
519 stat = cudaProfilerStart();
520 CU_RET_ERR(stat, "cudaProfilerStart failed");
524 void stopGpuProfiler(void)
526 /* Stopping the nvidia here allows us to eliminate the subsequent
527 API calls from the trace, e.g. uninitialization and cleanup. */
531 stat = cudaProfilerStop();
532 CU_RET_ERR(stat, "cudaProfilerStop failed");
536 void resetGpuProfiler(void)
538 /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
539 * the profiling here (can't stop it) which will achieve the desired effect if
540 * the run was started with the profiling disabled.
542 * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.
551 DeviceStatus gpu_info_get_stat(const gmx_gpu_info_t& info, int index)
553 return info.deviceInfo[index].stat;
556 /*! \brief Check status returned from peer access CUDA call, and error out or warn appropriately
557 * \param[in] stat CUDA call return status
558 * \param[in] gpuA ID for GPU initiating peer access call
559 * \param[in] gpuB ID for remote GPU
560 * \param[in] mdlog Logger object
561 * \param[in] cudaCallName name of CUDA peer access call
563 static void peerAccessCheckStat(const cudaError_t stat,
566 const gmx::MDLogger& mdlog,
567 const char* cudaCallName)
569 if ((stat == cudaErrorInvalidDevice) || (stat == cudaErrorInvalidValue))
571 std::string errorString =
572 gmx::formatString("%s from GPU %d to GPU %d failed", cudaCallName, gpuA, gpuB);
573 CU_RET_ERR(stat, errorString.c_str());
575 if (stat != cudaSuccess)
577 GMX_LOG(mdlog.warning)
579 .appendTextFormatted(
580 "GPU peer access not enabled between GPUs %d and %d due to unexpected "
581 "return value from %s: %s",
582 gpuA, gpuB, cudaCallName, cudaGetErrorString(stat));
586 void setupGpuDevicePeerAccess(const std::vector<int>& gpuIdsToUse, const gmx::MDLogger& mdlog)
590 // take a note of currently-set GPU
592 stat = cudaGetDevice(¤tGpu);
593 CU_RET_ERR(stat, "cudaGetDevice in setupGpuDevicePeerAccess failed");
595 std::string message = gmx::formatString(
596 "Note: Peer access enabled between the following GPU pairs in the node:\n ");
597 bool peerAccessEnabled = false;
599 for (unsigned int i = 0; i < gpuIdsToUse.size(); i++)
601 int gpuA = gpuIdsToUse[i];
602 stat = cudaSetDevice(gpuA);
603 if (stat != cudaSuccess)
605 GMX_LOG(mdlog.warning)
607 .appendTextFormatted(
608 "GPU peer access not enabled due to unexpected return value from "
609 "cudaSetDevice(%d): %s",
610 gpuA, cudaGetErrorString(stat));
613 for (unsigned int j = 0; j < gpuIdsToUse.size(); j++)
617 int gpuB = gpuIdsToUse[j];
618 int canAccessPeer = 0;
619 stat = cudaDeviceCanAccessPeer(&canAccessPeer, gpuA, gpuB);
620 peerAccessCheckStat(stat, gpuA, gpuB, mdlog, "cudaDeviceCanAccessPeer");
624 stat = cudaDeviceEnablePeerAccess(gpuB, 0);
625 peerAccessCheckStat(stat, gpuA, gpuB, mdlog, "cudaDeviceEnablePeerAccess");
627 message = gmx::formatString("%s%d->%d ", message.c_str(), gpuA, gpuB);
628 peerAccessEnabled = true;
634 // re-set GPU to that originally set
635 stat = cudaSetDevice(currentGpu);
636 if (stat != cudaSuccess)
638 CU_RET_ERR(stat, "cudaSetDevice in setupGpuDevicePeerAccess failed");
642 if (peerAccessEnabled)
644 GMX_LOG(mdlog.info).asParagraph().appendTextFormatted("%s", message.c_str());