2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team.
5 * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Defines the CUDA implementations of the device management.
39 * \author Anca Hamuraru <anca@streamcomputing.eu>
40 * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
41 * \author Teemu Virolainen <teemu@streamcomputing.eu>
42 * \author Mark Abraham <mark.j.abraham@gmail.com>
43 * \author Szilárd Páll <pall.szilard@gmail.com>
44 * \author Artem Zhmurov <zhmurov@gmail.com>
46 * \ingroup module_hardware
50 #include "device_management.h"
54 #include "gromacs/gpu_utils/cudautils.cuh"
55 #include "gromacs/gpu_utils/device_context.h"
56 #include "gromacs/gpu_utils/device_stream.h"
57 #include "gromacs/utility/exceptions.h"
58 #include "gromacs/utility/programcontext.h"
59 #include "gromacs/utility/smalloc.h"
60 #include "gromacs/utility/stringutil.h"
62 #include "device_information.h"
65 * Max number of devices supported by CUDA (for consistency checking).
67 * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
69 static int c_cudaMaxDeviceCount = 32;
71 /** Dummy kernel used for sanity checking. */
72 static __global__ void dummy_kernel(void) {}
74 static cudaError_t checkCompiledTargetCompatibility(int deviceId, const cudaDeviceProp& deviceProp)
76 cudaFuncAttributes attributes;
77 cudaError_t stat = cudaFuncGetAttributes(&attributes, dummy_kernel);
79 if (cudaErrorInvalidDeviceFunction == stat)
82 "\nWARNING: The %s binary does not include support for the CUDA architecture of "
83 "the GPU ID #%d (compute capability %d.%d) detected during detection. "
84 "By default, GROMACS supports all architectures of compute "
85 "capability >= 3.0, so your GPU "
86 "might be rare, or some architectures were disabled in the build. \n"
87 "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
88 "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. \n",
89 gmx::getProgramContext().displayName(), deviceId, deviceProp.major, deviceProp.minor);
96 * \brief Runs GPU sanity checks.
98 * Runs a series of checks to determine that the given GPU and underlying CUDA
99 * driver/runtime functions properly.
101 * \todo Currently we do not make a distinction between the type of errors
102 * that can appear during functionality checks. This needs to be improved,
103 * e.g if the dummy test kernel fails to execute with a "device busy message"
104 * we should appropriately report that the device is busy instead of NonFunctional.
106 * \todo Introduce errors codes and handle errors more smoothly.
109 * \param[in] dev_id the device ID of the GPU or -1 if the device has already been initialized
110 * \param[in] dev_prop The device properties structure
111 * \returns 0 if the device looks OK, -1 if it sanity checks failed, and -2 if the device is busy
113 static DeviceStatus isDeviceFunctional(int dev_id, const cudaDeviceProp& dev_prop)
118 cu_err = cudaGetDeviceCount(&dev_count);
119 if (cu_err != cudaSuccess)
121 fprintf(stderr, "Error %d while querying device count: %s\n", cu_err, cudaGetErrorString(cu_err));
122 return DeviceStatus::NonFunctional;
125 /* no CUDA compatible device at all */
128 return DeviceStatus::NonFunctional;
131 /* things might go horribly wrong if cudart is not compatible with the driver */
132 if (dev_count < 0 || dev_count > c_cudaMaxDeviceCount)
134 return DeviceStatus::NonFunctional;
137 if (dev_id == -1) /* device already selected let's not destroy the context */
139 cu_err = cudaGetDevice(&id);
140 if (cu_err != cudaSuccess)
142 fprintf(stderr, "Error %d while querying device id: %s\n", cu_err, cudaGetErrorString(cu_err));
143 return DeviceStatus::NonFunctional;
149 if (id > dev_count - 1) /* pfff there's no such device */
152 "The requested device with id %d does not seem to exist (device count=%d)\n",
154 return DeviceStatus::NonFunctional;
158 /* both major & minor is 9999 if no CUDA capable devices are present */
159 if (dev_prop.major == 9999 && dev_prop.minor == 9999)
161 return DeviceStatus::NonFunctional;
163 /* we don't care about emulation mode */
164 if (dev_prop.major == 0)
166 return DeviceStatus::NonFunctional;
171 cu_err = cudaSetDevice(id);
172 if (cu_err != cudaSuccess)
174 fprintf(stderr, "Error %d while switching to device #%d: %s\n", cu_err, id,
175 cudaGetErrorString(cu_err));
176 return DeviceStatus::NonFunctional;
180 cu_err = checkCompiledTargetCompatibility(dev_id, dev_prop);
181 // Avoid triggering an error if GPU devices are in exclusive or prohibited mode;
182 // it is enough to check for cudaErrorDevicesUnavailable only here because
183 // if we encounter it that will happen in cudaFuncGetAttributes in the above function.
184 if (cu_err == cudaErrorDevicesUnavailable)
186 return DeviceStatus::Unavailable;
188 else if (cu_err != cudaSuccess)
190 return DeviceStatus::NonFunctional;
193 /* try to execute a dummy kernel */
196 KernelLaunchConfig config;
197 config.blockSize[0] = 512;
198 const auto dummyArguments = prepareGpuKernelArguments(dummy_kernel, config);
199 DeviceInformation deviceInfo;
200 const DeviceContext deviceContext(deviceInfo);
201 const DeviceStream deviceStream(deviceContext, DeviceStreamPriority::Normal, false);
202 launchGpuKernel(dummy_kernel, config, deviceStream, nullptr, "Dummy kernel", dummyArguments);
204 catch (gmx::GromacsException& ex)
206 // launchGpuKernel error is not fatal and should continue with marking the device bad
208 "Error occurred while running dummy kernel sanity check on device #%d:\n %s\n", id,
209 formatExceptionMessageToString(ex).c_str());
210 return DeviceStatus::NonFunctional;
213 if (cudaDeviceSynchronize() != cudaSuccess)
215 return DeviceStatus::NonFunctional;
218 /* destroy context if we created one */
221 cu_err = cudaDeviceReset();
222 CU_RET_ERR(cu_err, "cudaDeviceReset failed");
225 return DeviceStatus::Compatible;
228 /*! \brief Returns true if the gpu characterized by the device properties is supported
229 * by the native gpu acceleration.
231 * \param[in] deviceProperties The CUDA device properties of the gpus to test.
232 * \returns True if the GPU properties passed indicate a compatible
233 * GPU, otherwise false.
235 static bool isDeviceGenerationSupported(const cudaDeviceProp& deviceProperties)
237 return (deviceProperties.major >= 3);
240 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
242 * Returns a status value which indicates compatibility or one of the following
243 * errors: incompatibility or insanity (=unexpected behavior).
245 * As the error handling only permits returning the state of the GPU, this function
246 * does not clear the CUDA runtime API status allowing the caller to inspect the error
247 * upon return. Note that this also means it is the caller's responsibility to
248 * reset the CUDA runtime state.
250 * \param[in] deviceId the ID of the GPU to check.
251 * \param[in] deviceProp the CUDA device properties of the device checked.
252 * \returns the status of the requested device
254 static DeviceStatus checkDeviceStatus(int deviceId, const cudaDeviceProp& deviceProp)
256 if (!isDeviceGenerationSupported(deviceProp))
258 return DeviceStatus::Incompatible;
260 return isDeviceFunctional(deviceId, deviceProp);
263 bool isDeviceDetectionFunctional(std::string* errorMessage)
266 int driverVersion = -1;
267 stat = cudaDriverGetVersion(&driverVersion);
268 GMX_ASSERT(stat != cudaErrorInvalidValue,
269 "An impossible null pointer was passed to cudaDriverGetVersion");
272 gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
273 cudaGetErrorName(stat), cudaGetErrorString(stat))
275 bool foundDriver = (driverVersion > 0);
278 // Can't detect GPUs if there is no driver
279 if (errorMessage != nullptr)
281 errorMessage->assign("No valid CUDA driver found");
287 stat = cudaGetDeviceCount(&numDevices);
288 if (stat != cudaSuccess)
290 if (errorMessage != nullptr)
292 /* cudaGetDeviceCount failed which means that there is
293 * something wrong with the machine: driver-runtime
294 * mismatch, all GPUs being busy in exclusive mode,
295 * invalid CUDA_VISIBLE_DEVICES, or some other condition
296 * which should result in GROMACS issuing at least a
298 errorMessage->assign(cudaGetErrorString(stat));
301 // Consume the error now that we have prepared to handle
302 // it. This stops it reappearing next time we check for
303 // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
304 // valid devices, then cudaGetLastError returns the
305 // (undocumented) cudaErrorNoDevice, but this should not be a
306 // problem as there should be no future CUDA API calls.
307 // NVIDIA bug report #2038718 has been filed.
313 // We don't actually use numDevices here, that's not the job of
318 std::vector<std::unique_ptr<DeviceInformation>> findDevices()
321 cudaError_t stat = cudaGetDeviceCount(&numDevices);
322 if (stat != cudaSuccess)
324 GMX_THROW(gmx::InternalError(
325 "Invalid call of findDevices() when CUDA API returned an error, perhaps "
326 "canPerformDeviceDetection() was not called appropriately beforehand."));
329 // We expect to start device support/sanity checks with a clean runtime error state
330 gmx::ensureNoPendingCudaError("");
332 std::vector<std::unique_ptr<DeviceInformation>> deviceInfoList(numDevices);
333 for (int i = 0; i < numDevices; i++)
336 memset(&prop, 0, sizeof(cudaDeviceProp));
337 stat = cudaGetDeviceProperties(&prop, i);
338 const DeviceStatus checkResult =
339 (stat != cudaSuccess) ? DeviceStatus::NonFunctional : checkDeviceStatus(i, prop);
341 deviceInfoList[i] = std::make_unique<DeviceInformation>();
343 deviceInfoList[i]->id = i;
344 deviceInfoList[i]->prop = prop;
345 deviceInfoList[i]->status = checkResult;
347 if (checkResult != DeviceStatus::Compatible)
350 // - we inspect the CUDA API state to retrieve and record any
351 // errors that occurred during is_gmx_supported_gpu_id() here,
352 // but this would be more elegant done within is_gmx_supported_gpu_id()
353 // and only return a string with the error if one was encountered.
354 // - we'll be reporting without rank information which is not ideal.
355 // - we'll end up warning also in cases where users would already
356 // get an error before mdrun aborts.
358 // Here we also clear the CUDA API error state so potential
359 // errors during sanity checks don't propagate.
360 if ((stat = cudaGetLastError()) != cudaSuccess)
362 gmx_warning("An error occurred while sanity checking device #%d; %s: %s",
363 deviceInfoList[i]->id, cudaGetErrorName(stat), cudaGetErrorString(stat));
368 stat = cudaPeekAtLastError();
369 GMX_RELEASE_ASSERT(stat == cudaSuccess,
370 gmx::formatString("We promise to return with clean CUDA state, but "
371 "non-success state encountered: %s: %s",
372 cudaGetErrorName(stat), cudaGetErrorString(stat))
375 return deviceInfoList;
378 void setActiveDevice(const DeviceInformation& deviceInfo)
380 int deviceId = deviceInfo.id;
383 stat = cudaSetDevice(deviceId);
384 if (stat != cudaSuccess)
386 auto message = gmx::formatString("Failed to initialize GPU #%d", deviceId);
387 CU_RET_ERR(stat, message.c_str());
392 fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceId, deviceInfo.prop.name);
396 void releaseDevice(DeviceInformation* deviceInfo)
398 // device was used is that deviceInfo will be non-null.
399 if (deviceInfo != nullptr)
404 stat = cudaGetDevice(&gpuid);
405 if (stat == cudaSuccess)
409 fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
412 stat = cudaDeviceReset();
413 if (stat != cudaSuccess)
415 gmx_warning("Failed to free GPU #%d: %s", gpuid, cudaGetErrorString(stat));
421 std::string getDeviceInformationString(const DeviceInformation& deviceInfo)
423 bool gpuExists = (deviceInfo.status != DeviceStatus::Nonexistent
424 && deviceInfo.status != DeviceStatus::NonFunctional);
428 return gmx::formatString("#%d: %s, stat: %s", deviceInfo.id, "N/A",
429 c_deviceStateString[deviceInfo.status]);
433 return gmx::formatString("#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
434 deviceInfo.id, deviceInfo.prop.name, deviceInfo.prop.major,
435 deviceInfo.prop.minor, deviceInfo.prop.ECCEnabled ? "yes" : " no",
436 c_deviceStateString[deviceInfo.status]);