2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * \brief Define functions for detection and initialization for CUDA devices.
38 * \author Szilard Pall <pall.szilard@gmail.com>
43 #include "gpu_utils.h"
51 #include <cuda_profiler_api.h>
53 #include "gromacs/gpu_utils/cudautils.cuh"
54 #include "gromacs/gpu_utils/pmalloc_cuda.h"
55 #include "gromacs/hardware/gpu_hw_info.h"
56 #include "gromacs/utility/basedefinitions.h"
57 #include "gromacs/utility/cstringutil.h"
58 #include "gromacs/utility/fatalerror.h"
59 #include "gromacs/utility/gmxassert.h"
60 #include "gromacs/utility/logger.h"
61 #include "gromacs/utility/programcontext.h"
62 #include "gromacs/utility/smalloc.h"
63 #include "gromacs/utility/snprintf.h"
64 #include "gromacs/utility/stringutil.h"
/* HAVE_NVML_APPLICATION_CLOCKS is non-zero when the NVML API we compile
 * against is new enough (API version >= 6) to manage application clocks.
 * NOTE(review): the #if HAVE_NVML / #else lines that should separate these
 * two definitions appear to be missing from this extraction — confirm. */
#define HAVE_NVML_APPLICATION_CLOCKS (NVML_API_VERSION >= 6)
#define HAVE_NVML_APPLICATION_CLOCKS 0
#endif /* HAVE_NVML */
/* Convenience macro: when CHECK_CUDA_ERRORS is enabled and the NVML is new
 * enough, warn if an NVML call did not return NVML_SUCCESS; otherwise it
 * compiles to a no-op. */
#if defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS
/*! Check for NVML error on the return status of a NVML API call. */
#    define HANDLE_NVML_RET_ERR(status, msg) \
    if (status != NVML_SUCCESS) \
    gmx_warning("%s: %s\n", msg, nvmlErrorString(status)); \
#else /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
#    define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
#endif /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
/* Compile-time flag recording whether the NVML we were built against
 * supports managing application clocks. */
#if HAVE_NVML_APPLICATION_CLOCKS
static const gmx_bool bCompiledWithApplicationClockSupport = true;
/* NOTE(review): an #else line appears to have been lost between these two
 * definitions in this extraction — confirm against the original file. */
static const gmx_bool gmx_unused bCompiledWithApplicationClockSupport = false;
/*!
 * Max number of devices supported by CUDA (for consistency checking).
 *
 * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
 */
static int cuda_max_device_count = 32;

/* True when mdrun runs under nvprof (which sets NVPROF_ID); used to decide
 * whether the CUDA profiler start/stop API should be driven from GROMACS. */
static bool cudaProfilerRun = ((getenv("NVPROF_ID") != NULL));

/** Dummy kernel used for sanity checking; launched by do_sanity_checks()
 *  to verify the selected device can execute the compiled device code. */
static __global__ void k_dummy_test(void)
/*! \brief Checks that the compiled binary can run on the given device.
 *
 * Queries the attributes of the dummy kernel: if no embedded device binary
 * (or loadable PTX) matches the device architecture, a fatal error with
 * remediation advice is issued. Also rejects code JIT-compiled from 2.0
 * (sm_20) sources when running on compute capability >= 3.0 devices.
 *
 * \param[in] devInfo  device to check the compiled code against
 */
static void checkCompiledTargetCompatibility(const gmx_device_info_t *devInfo)
    cudaFuncAttributes attributes;
    cudaError_t stat = cudaFuncGetAttributes(&attributes, k_dummy_test);

    /* This status means no embedded device code matches this device. */
    if (cudaErrorInvalidDeviceFunction == stat)
        "The %s binary was not compiled for the selected GPU "
        "(device ID #%d, compute capability %d.%d).\n"
        "When selecting target GPU architectures with GMX_CUDA_TARGET_SM, "
        "make sure to pass the appropriate architecture(s) corresponding to the "
        "device(s) intended to be used (see in the GPU info listing) or alternatively "
        "pass in GMX_CUDA_TARGET_COMPUTE an appropriate virtual architecture. ",
        gmx::getProgramContext().displayName(), devInfo->id,
        devInfo->prop.major, devInfo->prop.minor);

    /* Any other failure of the attribute query is fatal too. */
    CU_RET_ERR(stat, "cudaFuncGetAttributes failed");

    /* PTX version < 3.0 on a >= 3.0 device means JIT from 2.0 sources. */
    if (devInfo->prop.major >= 3 && attributes.ptxVersion < 30)
        "The GPU device code was compiled at runtime from 2.0 source which is "
        "not compatible with the selected GPU (device ID #%d, compute capability %d.%d). "
        "Pass the appropriate target in GMX_CUDA_TARGET_SM or a >=30 value to GMX_CUDA_TARGET_COMPUTE.",
        devInfo->prop.major, devInfo->prop.minor);
/*! \brief Returns whether \p h_ptr points to host memory that CUDA
 *         recognizes as registered/page-locked (pinned).
 *
 * \param[in] h_ptr  host pointer to query
 */
bool isHostMemoryPinned(void *h_ptr)
    cudaPointerAttributes memoryAttributes;
    cudaError_t stat = cudaPointerGetAttributes(&memoryAttributes, h_ptr);

        case cudaErrorInvalidValue:
            // If the buffer was not pinned, then it will not be recognized by CUDA at all
            // Reset the last error status
            // (clears the sticky error left behind by the failed query above)
            /* Any status other than success/invalid-value is unexpected. */
            CU_RET_ERR(stat, "Unexpected CUDA error");
165 * \brief Runs GPU sanity checks.
167 * Runs a series of checks to determine that the given GPU and underlying CUDA
168 * driver/runtime functions properly.
169 * Returns properties of a device with given ID or the one that has
 * already been initialized earlier in the case of dev_id == -1.
172 * \param[in] dev_id the device ID of the GPU or -1 if the device has already been initialized
173 * \param[out] dev_prop pointer to the structure in which the device properties will be returned
174 * \returns 0 if the device looks OK
 * TODO: introduce error codes and handle errors more smoothly.
/* Runs the driver/runtime/device sanity checks documented above and fills
 * *dev_prop for the checked device; returns 0 when the device looks OK. */
static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
    cu_err = cudaGetDeviceCount(&dev_count);
    if (cu_err != cudaSuccess)
        fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
                cudaGetErrorString(cu_err));

    /* no CUDA compatible device at all */

    /* things might go horribly wrong if cudart is not compatible with the driver */
    if (dev_count < 0 || dev_count > cuda_max_device_count)

    if (dev_id == -1) /* device already selected let's not destroy the context */
        cu_err = cudaGetDevice(&id);
        if (cu_err != cudaSuccess)
            fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
                    cudaGetErrorString(cu_err));

    /* Validate the (requested or current) id against the device count. */
    if (id > dev_count - 1) /* pfff there's no such device */
        fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",

    /* Zero-initialize so callers never read stale properties on a failure path. */
    memset(dev_prop, 0, sizeof(cudaDeviceProp));
    cu_err = cudaGetDeviceProperties(dev_prop, id);
    if (cu_err != cudaSuccess)
        fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
                cudaGetErrorString(cu_err));

    /* both major & minor is 9999 if no CUDA capable devices are present */
    if (dev_prop->major == 9999 && dev_prop->minor == 9999)
    /* we don't care about emulation mode */
    if (dev_prop->major == 0)

    /* Make the device current so the dummy kernel below runs on it. */
    cu_err = cudaSetDevice(id);
    if (cu_err != cudaSuccess)
        fprintf(stderr, "Error %d while switching to device #%d: %s\n",
                cu_err, id, cudaGetErrorString(cu_err));

    /* try to execute a dummy kernel */
    k_dummy_test<<< 1, 512>>> ();
    if (cudaThreadSynchronize() != cudaSuccess)

    /* destroy context if we created one */
    cu_err = cudaDeviceReset();
    CU_RET_ERR(cu_err, "cudaDeviceReset failed");
272 #if HAVE_NVML_APPLICATION_CLOCKS
273 /*! \brief Determines and adds the NVML device ID to the passed \cuda_dev.
275 * Determines and adds the NVML device ID to the passed \cuda_dev. This is done by
276 * matching PCI-E information from \cuda_dev with the available NVML devices.
278 * \param[in,out] cuda_dev CUDA device information to enrich with NVML device info
279 * \returns true if \cuda_dev could be enriched with matching NVML device information.
/* Walks all NVML devices and records in cuda_dev the handle of the one whose
 * PCI bus/device/domain triple matches the CUDA device's properties. */
static bool addNVMLDeviceId(gmx_device_info_t* cuda_dev)
    nvmlDevice_t nvml_device_id;
    unsigned int nvml_device_count = 0;
    nvmlReturn_t nvml_stat = nvmlDeviceGetCount ( &nvml_device_count );
    bool nvmlWasInitialized = false;
    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetCount failed" );
    /* Stop iterating as soon as any NVML call fails. */
    for (unsigned int nvml_device_idx = 0; nvml_stat == NVML_SUCCESS && nvml_device_idx < nvml_device_count; ++nvml_device_idx)
        nvml_stat = nvmlDeviceGetHandleByIndex ( nvml_device_idx, &nvml_device_id );
        HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetHandleByIndex failed" );
        if (nvml_stat != NVML_SUCCESS)

        nvmlPciInfo_t nvml_pci_info;
        nvml_stat = nvmlDeviceGetPciInfo ( nvml_device_id, &nvml_pci_info );
        HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetPciInfo failed" );
        if (nvml_stat != NVML_SUCCESS)
        /* Match on the full PCI address: bus, device, and domain. */
        if (static_cast<unsigned int>(cuda_dev->prop.pciBusID) == nvml_pci_info.bus &&
            static_cast<unsigned int>(cuda_dev->prop.pciDeviceID) == nvml_pci_info.device &&
            static_cast<unsigned int>(cuda_dev->prop.pciDomainID) == nvml_pci_info.domain)
            nvmlWasInitialized = true;
            cuda_dev->nvml_device_id = nvml_device_id;
    return nvmlWasInitialized;
316 /*! \brief Reads and returns the application clocks for device.
318 * \param[in] device The GPU device
319 * \param[out] app_sm_clock The current application SM clock
320 * \param[out] app_mem_clock The current application memory clock
 * \returns whether application clocks are supported
/* Reads the device's current application SM and memory clocks via NVML.
 * Returns false when the device reports application clocks as unsupported. */
static bool getApplicationClocks(const gmx_device_info_t *cuda_dev,
                                 unsigned int *app_sm_clock,
                                 unsigned int *app_mem_clock)
    nvmlReturn_t nvml_stat;

    nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_SM, app_sm_clock);
    /* NVML_ERROR_NOT_SUPPORTED means this GPU has no application clocks. */
    if (NVML_ERROR_NOT_SUPPORTED == nvml_stat)
    HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed for NVIDIA_CLOCK_SM");
    nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, app_mem_clock);
    HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed for NVIDIA_CLOCK_MEM");
340 #endif /* HAVE_NVML_APPLICATION_CLOCKS */
342 /*! \brief Tries to set application clocks for the GPU with the given index.
344 * Application clocks are set to the max supported value to increase
345 * performance if application clock permissions allow this. For future
346 * GPU architectures a more sophisticated scheme might be required.
348 * \todo Refactor this into a detection phase and a work phase. Also
349 * refactor to remove compile-time dependence on logging header.
351 * \param mdlog log file to write to
352 * \param[in] cuda_dev GPU device info for the GPU in use
353 * \returns true if no error occurs during application clocks handling.
/* Implementation of the application-clock setup documented above; returns
 * true when no error occurred (NVML problems are deliberately non-fatal). */
static gmx_bool init_gpu_application_clocks(
        const gmx::MDLogger &mdlog,
        gmx_device_info_t *cuda_dev)
    const cudaDeviceProp *prop = &cuda_dev->prop;
    int cuda_compute_capability = prop->major * 10 + prop->minor;
    /* Only Tesla boards with CC >= 3.5 and Quadro boards with CC >= 5.2
     * expose settable application clocks. */
    gmx_bool bGpuCanUseApplicationClocks =
        ((0 == gmx_wcmatch("*Tesla*", prop->name) && cuda_compute_capability >= 35 ) ||
         (0 == gmx_wcmatch("*Quadro*", prop->name) && cuda_compute_capability >= 52 ));
    if (!bGpuCanUseApplicationClocks)

    /* Built without NVML at all: we can only tell the user what they miss. */
    GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
            "NOTE: GROMACS was configured without NVML support hence it can not exploit\n"
            " application clocks of the detected %s GPU to improve performance.\n"
            " Recompile with the NVML library (compatible with the driver used) or set application clocks manually.",

    /* Built with an NVML too old for application clocks: warn similarly. */
    if (!bCompiledWithApplicationClockSupport)
        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
                "NOTE: GROMACS was compiled with an old NVML library which does not support\n"
                " managing application clocks of the detected %s GPU to improve performance.\n"
                " If your GPU supports application clocks, upgrade NVML (and driver) and recompile or set the clocks manually.",

    /* We've compiled with NVML application clocks support, and have a GPU that can use it */
    nvmlReturn_t nvml_stat = NVML_SUCCESS;

    //TODO: GMX_GPU_APPLICATION_CLOCKS is currently only used to enable/disable setting of application clocks
    // this variable can be later used to give a user more fine grained control.
    env = getenv("GMX_GPU_APPLICATION_CLOCKS");
    if (env != NULL && ( strcmp( env, "0") == 0 ||
                         gmx_strcasecmp( env, "OFF") == 0 ||
                         gmx_strcasecmp( env, "DISABLE") == 0 ))

    nvml_stat = nvmlInit();
    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlInit failed." );
    if (nvml_stat != NVML_SUCCESS)

    /* Find the NVML handle matching this CUDA device's PCI address. */
    if (!addNVMLDeviceId(cuda_dev))

    //get current application clocks setting
    if (!getApplicationClocks(cuda_dev,
                              &cuda_dev->nvml_orig_app_sm_clock,
                              &cuda_dev->nvml_orig_app_mem_clock))

    //get max application clocks
    unsigned int max_sm_clock = 0;
    unsigned int max_mem_clock = 0;
    nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_SM, &max_sm_clock);
    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
    nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock);
    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );

    cuda_dev->nvml_is_restricted = NVML_FEATURE_ENABLED;
    cuda_dev->nvml_app_clocks_changed = false;

    /* Clocks already at the maximum: nothing to change, just report them. */
    if (cuda_dev->nvml_orig_app_sm_clock >= max_sm_clock)
        //TODO: This should probably be integrated into the GPU Properties table.
        GMX_LOG(mdlog.info).appendTextFormatted(
                "Application clocks (GPU clocks) for %s are (%d,%d)",
                cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);

    if (cuda_compute_capability >= 60)
        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
                "Cannot change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nPlease contact your admin to change application clocks.\n",
                cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);

    /* Check whether the clock-setting API is restricted for non-root users. */
    nvml_stat = nvmlDeviceGetAPIRestriction(cuda_dev->nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(cuda_dev->nvml_is_restricted));
    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetAPIRestriction failed" );

    if (nvml_stat != NVML_SUCCESS)
        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
                "Cannot change GPU application clocks to optimal values due to NVML error (%d): %s.",
                nvml_stat, nvmlErrorString(nvml_stat));

    if (cuda_dev->nvml_is_restricted != NVML_FEATURE_DISABLED)
        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
                "Cannot change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clocks.",
                cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);

    /* Note: Distinguishing between different types of GPUs here might be necessary in the future,
       e.g. if max application clocks should not be used for certain GPUs. */
    GMX_LOG(mdlog.warning).appendTextFormatted(
            "Changing GPU application clocks for %s to (%d,%d)",
            cuda_dev->prop.name, max_mem_clock, max_sm_clock);
    nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, max_mem_clock, max_sm_clock);
    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
    /* Remember what we set, so the cleanup path can restore the originals. */
    cuda_dev->nvml_app_clocks_changed = true;
    cuda_dev->nvml_set_app_sm_clock = max_sm_clock;
    cuda_dev->nvml_set_app_mem_clock = max_mem_clock;
475 #endif /* HAVE_NVML */
478 /*! \brief Resets application clocks if changed and cleans up NVML for the passed \gpu_dev.
480 * \param[in] gpu_dev CUDA device information
/* Restores the original application clocks (only when we changed them and
 * nothing else modified them since) and shuts NVML down; true on success. */
static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused * cuda_dev)
#if !HAVE_NVML_APPLICATION_CLOCKS
    /* Without NVML application-clock support this is a no-op. */
    GMX_UNUSED_VALUE(cuda_dev);
#else /* HAVE_NVML_APPLICATION_CLOCKS */
    nvmlReturn_t nvml_stat = NVML_SUCCESS;
        cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED &&
        cuda_dev->nvml_app_clocks_changed)
        /* Check if the clocks are still what we set them to.
         * If so, set them back to the state we originally found them in.
         * If not, don't touch them, because something else set them later.
         * NOTE(review): the return value of getApplicationClocks() is ignored
         * here — on failure the comparison uses uninitialized values; confirm.
         */
        unsigned int app_sm_clock, app_mem_clock;
        getApplicationClocks(cuda_dev, &app_sm_clock, &app_mem_clock);
        if (app_sm_clock == cuda_dev->nvml_set_app_sm_clock &&
            app_mem_clock == cuda_dev->nvml_set_app_mem_clock)
            nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
            HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceSetApplicationsClock failed" );
    nvml_stat = nvmlShutdown();
    HANDLE_NVML_RET_ERR( nvml_stat, "nvmlShutdown failed" );
    return (nvml_stat == NVML_SUCCESS);
#endif /* HAVE_NVML_APPLICATION_CLOCKS */
/* Makes the given device current, verifies that the compiled device code can
 * run on it, and tries to raise its application clocks (best effort). */
void init_gpu(const gmx::MDLogger &mdlog,
              gmx_device_info_t *deviceInfo)
    /* Activate the chosen device; fatal if this fails. */
    stat = cudaSetDevice(deviceInfo->id);
    if (stat != cudaSuccess)
        auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id);
        CU_RET_ERR(stat, message.c_str());

    fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceInfo->id, deviceInfo->prop.name);

    checkCompiledTargetCompatibility(deviceInfo);

    //Ignoring return value as NVML errors should be treated not critical.
    init_gpu_application_clocks(mdlog, deviceInfo);
/* Resets application clocks (warning only on failure) and clears the CUDA
 * context; a null deviceInfo means the GPU was never used, so do nothing. */
void free_gpu(const gmx_device_info_t *deviceInfo)
    // One should only attempt to clear the device context when
    // it has been used, but currently the only way to know that a GPU
    // device was used is that deviceInfo will be non-null.
    if (deviceInfo == nullptr)

    stat = cudaGetDevice(&gpuid);
    CU_RET_ERR(stat, "cudaGetDevice failed");
    fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);

    /* Failing to restore clocks is not fatal — just warn. */
    if (!reset_gpu_application_clocks(deviceInfo))
        gmx_warning("Failed to reset GPU application clocks on GPU #%d", deviceInfo->id);

    /* Destroying the context is also best-effort at shutdown. */
    stat = cudaDeviceReset();
    if (stat != cudaSuccess)
        gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat));
/*! \brief Returns the device-info struct of the GPU with the given id.
 *
 * Issues an internal-consistency fatal error (gmx_incons) when \p deviceId
 * is outside the range of detected devices.
 */
gmx_device_info_t *getDeviceInfo(const gmx_gpu_info_t &gpu_info,
                                 int                   deviceId)
{
    // Reject ids outside [0, n_dev) before touching the array.
    if (deviceId < 0 || deviceId >= gpu_info.n_dev)
    {
        gmx_incons("Invalid GPU deviceId requested");
    }
    return &gpu_info.gpu_dev[deviceId];
}
579 /*! \brief Returns true if the gpu characterized by the device properties is
580 * supported by the native gpu acceleration.
582 * \param[in] dev_prop the CUDA device properties of the gpus to test.
583 * \returns true if the GPU properties passed indicate a compatible
584 * GPU, otherwise false.
/*! \brief Returns whether the device described by \p dev_prop is supported
 *         by the native GPU acceleration (compute capability >= 2.0).
 */
static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
{
    // Anything with a major compute capability below 2 is unsupported.
    return dev_prop->major >= 2;
}
591 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
593 * Returns a status value which indicates compatibility or one of the following
594 * errors: incompatibility, insistence, or insanity (=unexpected behavior).
595 * It also returns the respective device's properties in \dev_prop (if applicable).
597 * \param[in] dev_id the ID of the GPU to check.
598 * \param[out] dev_prop the CUDA device properties of the device checked.
599 * \returns the status of the requested device
/* Implementation of the per-device compatibility check documented above:
 * existence check, sanity checks, then the capability test. */
static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
    stat = cudaGetDeviceCount(&ndev);
    if (stat != cudaSuccess)

    /* An id beyond the detected device count cannot exist. */
    if (dev_id > ndev - 1)
        return egpuNonexistent;

    /* TODO: currently we do not make a distinction between the type of errors
     * that can appear during sanity checks. This needs to be improved, e.g if
     * the dummy test kernel fails to execute with a "device busy message" we
     * should appropriately report that the device is busy instead of insane.
     */
    if (do_sanity_checks(dev_id, dev_prop) == 0)
        if (is_gmx_supported_gpu(dev_prop))
            return egpuCompatible;
        return egpuIncompatible;
/* Detects all CUDA devices, runs sanity/compatibility checks on each, and
 * fills gpu_info; on detection failure an error message is copied to err_str. */
int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
    int i, ndev, checkres, retval;
    gmx_device_info_t *devs;

    gpu_info->n_dev_compatible = 0;

    stat = cudaGetDeviceCount(&ndev);
    if (stat != cudaSuccess)
        /* cudaGetDeviceCount failed which means that there is something
         * wrong with the machine: driver-runtime mismatch, all GPUs being
         * busy in exclusive mode, or some other condition which should
         * result in us issuing a warning and falling back to CPUs. */
        s = cudaGetErrorString(stat);
        /* NOTE(review): strncpy does not guarantee NUL-termination when the
         * message is STRLEN chars or longer — confirm err_str is pre-zeroed
         * or terminated by the caller. */
        strncpy(err_str, s, STRLEN*sizeof(err_str[0]));

    /* Record the per-device check result for every detected device. */
    for (i = 0; i < ndev; i++)
        checkres = is_gmx_supported_gpu_id(i, &prop);
        devs[i].stat = checkres;
        if (checkres == egpuCompatible)
            gpu_info->n_dev_compatible++;

    gpu_info->n_dev = ndev;
    gpu_info->gpu_dev = devs;
/*! \brief Returns the ids of all detected GPUs whose status is compatible. */
std::vector<int> getCompatibleGpus(const gmx_gpu_info_t &gpu_info)
{
    // Reserve for the worst case (every device compatible); possible minor
    // over-allocation, but not important for anything.
    std::vector<int> compatibleGpus;
    compatibleGpus.reserve(gpu_info.n_dev);
    for (int id = 0; id < gpu_info.n_dev; ++id)
    {
        assert(gpu_info.gpu_dev);
        if (gpu_info.gpu_dev[id].stat == egpuCompatible)
        {
            compatibleGpus.push_back(id);
        }
    }
    return compatibleGpus;
}
/*! \brief Returns a string describing the compatibility status of GPU \p index.
 *
 * \param[in] gpu_info  information about the detected devices
 * \param[in] index     index of the device to describe
 * \returns a static status string; any out-of-range index maps to the
 *          "nonexistent" status.
 *
 * Fix: the original only rejected index >= n_dev, so a negative index
 * read outside the gpu_dev[] array; guard both ends of the range.
 */
const char *getGpuCompatibilityDescription(const gmx_gpu_info_t &gpu_info,
                                           int                   index)
{
    return (index < 0 || index >= gpu_info.n_dev ?
            gpu_detect_res_str[egpuNonexistent] :
            gpu_detect_res_str[gpu_info.gpu_dev[index].stat]);
}
/*! \brief Frees the device-info array held by \p gpu_info.
 *
 * Safe to call with NULL (does nothing in that case).
 *
 * Fix: as extracted, the NULL check had no early return, so sfree() was
 * guarded incorrectly; restore the guard-and-return structure so the free
 * happens only for a non-null gpu_info.
 */
void free_gpu_info(const gmx_gpu_info_t *gpu_info)
{
    if (gpu_info == NULL)
    {
        return;
    }
    sfree(gpu_info->gpu_dev);
}
/*! \brief Formats a one-line description of GPU \p index into \p s.
 *
 * \param[out] s         caller-allocated buffer for the description
 * \param[in]  gpu_info  information about the detected devices
 * \param[in]  index     index of the device to describe
 *
 * Fix: the range check used '&&' ("index < 0 && index >= n_dev"), a
 * condition that is always false, so invalid indices fell through and
 * indexed outside gpu_dev[]; it must be '||'.
 */
void get_gpu_device_info_string(char *s, const gmx_gpu_info_t &gpu_info, int index)
{
    assert(s);

    if (index < 0 || index >= gpu_info.n_dev)
    {
        return;
    }

    gmx_device_info_t *dinfo = &gpu_info.gpu_dev[index];

    /* Only devices that got far enough through detection have valid
     * properties worth printing. */
    bool bGpuExists =
        dinfo->stat == egpuCompatible ||
        dinfo->stat == egpuIncompatible;

    if (!bGpuExists)
    {
        sprintf(s, "#%d: %s, stat: %s",
                dinfo->id, "N/A",
                gpu_detect_res_str[dinfo->stat]);
    }
    else
    {
        sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
                dinfo->id, dinfo->prop.name,
                dinfo->prop.major, dinfo->prop.minor,
                dinfo->prop.ECCEnabled ? "yes" : " no",
                gpu_detect_res_str[dinfo->stat]);
    }
}
/*! \brief Returns the id of the CUDA device currently active for this
 *         host thread; fatal on CUDA error. */
int get_current_cuda_gpu_device_id(void)
{
    int gpuid;
    CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");

    return gpuid;
}
/*! \brief Returns the size in bytes of the gmx_device_info_t struct. */
size_t sizeof_gpu_dev_info(void)
{
    return sizeof(gmx_device_info_t);
}
/* Installs the host-side alloc/free routines to use: when GPU kernels are in
 * use, pmalloc is installed (presumably the page-locked allocator from
 * pmalloc_cuda.h — confirm); the non-GPU branch is not visible here. */
void gpu_set_host_malloc_and_free(bool bUseGpuKernels,
                                  gmx_host_alloc_t **nb_alloc,
                                  gmx_host_free_t **nb_free)
        *nb_alloc = &pmalloc;
/* Starts CUDA profiling when running under nvprof (see cudaProfilerRun). */
void startGpuProfiler(void)
    /* The NVPROF_ID environment variable is set by nvprof and indicates that
       mdrun is executed in the CUDA profiler.
       If nvprof was run with "--profile-from-start off", the profiler will
       be started here. This way we can avoid tracing the CUDA events from the
       first part of the run. Starting the profiler again does nothing.
     */
    stat = cudaProfilerStart();
    CU_RET_ERR(stat, "cudaProfilerStart failed");
/* Stops CUDA profiling when running under nvprof (see cudaProfilerRun). */
void stopGpuProfiler(void)
    /* Stopping the nvidia profiler here allows us to eliminate the subsequent
       API calls from the trace, e.g. uninitialization and cleanup. */
    stat = cudaProfilerStop();
    CU_RET_ERR(stat, "cudaProfilerStop failed");
815 void resetGpuProfiler(void)
817 /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
818 * the profiling here (can't stop it) which will achieve the desired effect if
819 * the run was started with the profiling disabled.
821 * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.