2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team.
5 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Defines the CUDA implementations of the device management.
39 * \author Anca Hamuraru <anca@streamcomputing.eu>
40 * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
41 * \author Teemu Virolainen <teemu@streamcomputing.eu>
42 * \author Mark Abraham <mark.j.abraham@gmail.com>
43 * \author Szilárd Páll <pall.szilard@gmail.com>
44 * \author Artem Zhmurov <zhmurov@gmail.com>
46 * \ingroup module_hardware
50 #include "device_management.h"
54 #include "gromacs/gpu_utils/cudautils.cuh"
55 #include "gromacs/gpu_utils/device_context.h"
56 #include "gromacs/gpu_utils/device_stream.h"
57 #include "gromacs/utility/exceptions.h"
58 #include "gromacs/utility/programcontext.h"
59 #include "gromacs/utility/smalloc.h"
60 #include "gromacs/utility/stringutil.h"
62 #include "device_information.h"
65 * Max number of devices supported by CUDA (for consistency checking).
67 * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
69 static const int c_cudaMaxDeviceCount = 32;
71 /** Dummy kernel used for sanity checking. */
72 static __global__ void dummy_kernel() {}
74 static cudaError_t checkCompiledTargetCompatibility(int deviceId, const cudaDeviceProp& deviceProp)
76 cudaFuncAttributes attributes;
77 cudaError_t stat = cudaFuncGetAttributes(&attributes, dummy_kernel);
79 if (cudaErrorInvalidDeviceFunction == stat)
82 "\nWARNING: The %s binary does not include support for the CUDA architecture of "
83 "the GPU ID #%d (compute capability %d.%d) detected during detection. "
84 "By default, GROMACS supports all architectures of compute "
85 "capability >= 3.0, so your GPU "
86 "might be rare, or some architectures were disabled in the build. \n"
87 "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
88 "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. \n",
89 gmx::getProgramContext().displayName(),
99 * \brief Runs GPU sanity checks.
101 * Runs a series of checks to determine that the given GPU and underlying CUDA
102 * driver/runtime functions properly.
104 * \todo Currently we do not make a distinction between the type of errors
105 * that can appear during functionality checks. This needs to be improved,
106 * e.g if the dummy test kernel fails to execute with a "device busy message"
107 * we should appropriately report that the device is busy instead of NonFunctional.
109 * \todo Introduce errors codes and handle errors more smoothly.
112 * \param[in] deviceInfo Device information on the device to check.
113 * \returns The status enumeration value for the checked device:
115 static DeviceStatus isDeviceFunctional(const DeviceInformation& deviceInfo)
119 /* both major & minor is 9999 if no CUDA capable devices are present */
120 if (deviceInfo.prop.major == 9999 && deviceInfo.prop.minor == 9999)
122 return DeviceStatus::NonFunctional;
124 /* we don't care about emulation mode */
125 if (deviceInfo.prop.major == 0)
127 return DeviceStatus::NonFunctional;
130 cu_err = cudaSetDevice(deviceInfo.id);
131 if (cu_err != cudaSuccess)
134 "Error while switching to device #%d. %s\n",
136 gmx::getDeviceErrorString(cu_err).c_str());
137 return DeviceStatus::NonFunctional;
140 cu_err = checkCompiledTargetCompatibility(deviceInfo.id, deviceInfo.prop);
141 // Avoid triggering an error if GPU devices are in exclusive or prohibited mode;
142 // it is enough to check for cudaErrorDevicesUnavailable only here because
143 // if we encounter it that will happen in cudaFuncGetAttributes in the above function.
144 if (cu_err == cudaErrorDevicesUnavailable)
146 return DeviceStatus::Unavailable;
148 else if (cu_err != cudaSuccess)
150 return DeviceStatus::NonFunctional;
153 /* try to execute a dummy kernel */
156 KernelLaunchConfig config;
157 config.blockSize[0] = 512;
158 const auto dummyArguments = prepareGpuKernelArguments(dummy_kernel, config);
159 const DeviceContext deviceContext(deviceInfo);
160 const DeviceStream deviceStream(deviceContext, DeviceStreamPriority::Normal, false);
161 launchGpuKernel(dummy_kernel, config, deviceStream, nullptr, "Dummy kernel", dummyArguments);
163 catch (gmx::GromacsException& ex)
165 // launchGpuKernel error is not fatal and should continue with marking the device bad
167 "Error occurred while running dummy kernel sanity check on device #%d:\n %s\n",
169 formatExceptionMessageToString(ex).c_str());
170 return DeviceStatus::NonFunctional;
173 if (cudaDeviceSynchronize() != cudaSuccess)
175 return DeviceStatus::NonFunctional;
178 cu_err = cudaDeviceReset();
179 CU_RET_ERR(cu_err, "cudaDeviceReset failed");
181 return DeviceStatus::Compatible;
184 /*! \brief Returns true if the gpu characterized by the device properties is supported
185 * by the native gpu acceleration.
187 * \param[in] deviceProperties The CUDA device properties of the gpus to test.
188 * \returns True if the GPU properties passed indicate a compatible
189 * GPU, otherwise false.
191 static bool isDeviceGenerationSupported(const cudaDeviceProp& deviceProperties)
193 return (deviceProperties.major >= 3);
196 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
198 * Returns a status value which indicates compatibility or one of the following
199 * errors: incompatibility or insanity (=unexpected behavior).
201 * As the error handling only permits returning the state of the GPU, this function
202 * does not clear the CUDA runtime API status allowing the caller to inspect the error
203 * upon return. Note that this also means it is the caller's responsibility to
204 * reset the CUDA runtime state.
206 * \param[in] deviceInfo The device information on the device to check.
207 * \returns the status of the requested device
209 static DeviceStatus checkDeviceStatus(const DeviceInformation& deviceInfo)
211 if (!isDeviceGenerationSupported(deviceInfo.prop))
213 return DeviceStatus::Incompatible;
215 return isDeviceFunctional(deviceInfo);
218 bool isDeviceDetectionFunctional(std::string* errorMessage)
221 int driverVersion = -1;
222 stat = cudaDriverGetVersion(&driverVersion);
223 GMX_ASSERT(stat != cudaErrorInvalidValue,
224 "An impossible null pointer was passed to cudaDriverGetVersion");
225 GMX_RELEASE_ASSERT(stat == cudaSuccess,
226 ("An unexpected value was returned from cudaDriverGetVersion. "
227 + gmx::getDeviceErrorString(stat))
229 bool foundDriver = (driverVersion > 0);
232 // Can't detect GPUs if there is no driver
233 if (errorMessage != nullptr)
235 errorMessage->assign("No valid CUDA driver found");
241 stat = cudaGetDeviceCount(&numDevices);
242 if (stat != cudaSuccess)
244 if (errorMessage != nullptr)
246 /* cudaGetDeviceCount failed which means that there is
247 * something wrong with the machine: driver-runtime
248 * mismatch, all GPUs being busy in exclusive mode,
249 * invalid CUDA_VISIBLE_DEVICES, or some other condition
250 * which should result in GROMACS issuing at least a
252 errorMessage->assign(cudaGetErrorString(stat));
255 // Consume the error now that we have prepared to handle
256 // it. This stops it reappearing next time we check for
257 // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
258 // valid devices, then cudaGetLastError returns the
259 // (undocumented) cudaErrorNoDevice, but this should not be a
260 // problem as there should be no future CUDA API calls.
261 // NVIDIA bug report #2038718 has been filed.
267 // We don't actually use numDevices here, that's not the job of
272 std::vector<std::unique_ptr<DeviceInformation>> findDevices()
275 cudaError_t stat = cudaGetDeviceCount(&numDevices);
276 gmx::checkDeviceError(stat,
277 "Invalid call of findDevices() when CUDA API returned an error, perhaps "
278 "canPerformDeviceDetection() was not called appropriately beforehand.");
280 /* things might go horribly wrong if cudart is not compatible with the driver */
281 numDevices = std::min(numDevices, c_cudaMaxDeviceCount);
283 // We expect to start device support/sanity checks with a clean runtime error state
284 gmx::ensureNoPendingDeviceError("Trying to find available CUDA devices.");
286 std::vector<std::unique_ptr<DeviceInformation>> deviceInfoList(numDevices);
287 for (int i = 0; i < numDevices; i++)
290 memset(&prop, 0, sizeof(cudaDeviceProp));
291 stat = cudaGetDeviceProperties(&prop, i);
293 deviceInfoList[i] = std::make_unique<DeviceInformation>();
294 deviceInfoList[i]->id = i;
295 deviceInfoList[i]->prop = prop;
296 deviceInfoList[i]->deviceVendor = DeviceVendor::Nvidia;
298 const DeviceStatus checkResult = (stat != cudaSuccess) ? DeviceStatus::NonFunctional
299 : checkDeviceStatus(*deviceInfoList[i]);
301 deviceInfoList[i]->status = checkResult;
303 if (checkResult != DeviceStatus::Compatible)
306 // - we inspect the CUDA API state to retrieve and record any
307 // errors that occurred during is_gmx_supported_gpu_id() here,
308 // but this would be more elegant done within is_gmx_supported_gpu_id()
309 // and only return a string with the error if one was encountered.
310 // - we'll be reporting without rank information which is not ideal.
311 // - we'll end up warning also in cases where users would already
312 // get an error before mdrun aborts.
314 // Here we also clear the CUDA API error state so potential
315 // errors during sanity checks don't propagate.
316 const std::string errorMessage = gmx::formatString(
317 "An error occurred while sanity checking device #%d.", deviceInfoList[i]->id);
318 gmx::ensureNoPendingDeviceError(errorMessage);
322 stat = cudaPeekAtLastError();
325 ("We promise to return with clean CUDA state, but non-success state encountered. "
326 + gmx::getDeviceErrorString(stat))
329 return deviceInfoList;
332 void setActiveDevice(const DeviceInformation& deviceInfo)
334 int deviceId = deviceInfo.id;
337 stat = cudaSetDevice(deviceId);
338 if (stat != cudaSuccess)
340 auto message = gmx::formatString("Failed to initialize GPU #%d", deviceId);
341 CU_RET_ERR(stat, message);
346 fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceId, deviceInfo.prop.name);
350 void releaseDevice(DeviceInformation* deviceInfo)
352 // device was used is that deviceInfo will be non-null.
353 if (deviceInfo != nullptr)
358 stat = cudaGetDevice(&gpuid);
359 if (stat == cudaSuccess)
363 fprintf(stderr, "Cleaning up context on GPU ID #%d.\n", gpuid);
366 stat = cudaDeviceReset();
367 if (stat != cudaSuccess)
369 gmx_warning("Failed to free GPU #%d. %s", gpuid, gmx::getDeviceErrorString(stat).c_str());
375 std::string getDeviceInformationString(const DeviceInformation& deviceInfo)
377 bool gpuExists = (deviceInfo.status != DeviceStatus::Nonexistent
378 && deviceInfo.status != DeviceStatus::NonFunctional);
382 return gmx::formatString(
383 "#%d: %s, stat: %s", deviceInfo.id, "N/A", c_deviceStateString[deviceInfo.status]);
387 return gmx::formatString("#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
389 deviceInfo.prop.name,
390 deviceInfo.prop.major,
391 deviceInfo.prop.minor,
392 deviceInfo.prop.ECCEnabled ? "yes" : " no",
393 c_deviceStateString[deviceInfo.status]);