2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2010,2011,2012,2013,2014,2015,2016, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * \brief Define functions for detection and initialization for CUDA devices.
38 * \author Szilard Pall <pall.szilard@gmail.com>
43 #include "gpu_utils.h"
51 #include <cuda_profiler_api.h>
53 #include "gromacs/gpu_utils/cudautils.cuh"
54 #include "gromacs/gpu_utils/pmalloc_cuda.h"
55 #include "gromacs/hardware/gpu_hw_info.h"
56 #include "gromacs/utility/basedefinitions.h"
57 #include "gromacs/utility/cstringutil.h"
58 #include "gromacs/utility/smalloc.h"
/* HAVE_NVML_APPLICATION_CLOCKS is non-zero when the NVML headers in use are
 * new enough (API version >= 6) to provide the application-clocks API;
 * otherwise it is forced to 0 and all clock handling compiles away. */
62 #define HAVE_NVML_APPLICATION_CLOCKS (NVML_API_VERSION >= 6)
64 #define HAVE_NVML_APPLICATION_CLOCKS 0
65 #endif /* HAVE_NVML */
/* HANDLE_NVML_RET_ERR(status, msg): emits a gmx_warning with the NVML error
 * string when status != NVML_SUCCESS. It is a no-op unless both
 * CHECK_CUDA_ERRORS and NVML application-clock support are enabled, so
 * callers still check status explicitly where control flow depends on it. */
67 #if defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS
68 /*! Check for NVML error on the return status of a NVML API call. */
69 # define HANDLE_NVML_RET_ERR(status, msg) \
71 if (status != NVML_SUCCESS) \
73 gmx_warning("%s: %s\n", msg, nvmlErrorString(status)); \
76 #else /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
77 # define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
78 #endif /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
/* Compile-time flag exposed to the clock-handling code so user-facing
 * diagnostics can distinguish "built without support" from runtime errors. */
80 #if HAVE_NVML_APPLICATION_CLOCKS
81 static const gmx_bool bCompiledWithApplicationClockSupport = true;
83 static const gmx_bool gmx_unused bCompiledWithApplicationClockSupport = false;
87 * Max number of devices supported by CUDA (for consistency checking).
89 * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
91 static int cuda_max_device_count = 32;
/* True when mdrun is running under nvprof: nvprof exports NVPROF_ID into the
 * environment, and the profiler start/stop helpers below key off this flag. */
93 static bool cudaProfilerRun = ((getenv("NVPROF_ID") != NULL));
95 /** Dummy kernel used for sanity checking. */
96 __global__ void k_dummy_test()
/* Intentionally empty (body elided here): the kernel exists only so that a
 * launch followed by a synchronize can prove the device/driver pair can
 * actually execute code (see do_sanity_checks). */
102 * \brief Runs GPU sanity checks.
104 * Runs a series of checks to determine that the given GPU and underlying CUDA
105 * driver/runtime functions properly.
106 * Returns properties of a device with given ID or the one that has
107 * already been initialized earlier in the case of \dev_id == -1.
109 * \param[in] dev_id the device ID of the GPU or -1 if the device has already been initialized
110 * \param[out] dev_prop pointer to the structure in which the device properties will be returned
111 * \returns 0 if the device looks OK
113 * TODO: introduce errors codes and handle errors more smoothly.
115 static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
/* Step 1: query the device count; a failure here usually means a broken
 * driver/runtime installation, so report to stderr and fail. */
120 cu_err = cudaGetDeviceCount(&dev_count);
121 if (cu_err != cudaSuccess)
123 fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
124 cudaGetErrorString(cu_err));
128 /* no CUDA compatible device at all */
/* A negative or absurdly large count is a symptom of a runtime that is
 * incompatible with the installed driver. */
134 /* things might go horribly wrong if cudart is not compatible with the driver */
135 if (dev_count < 0 || dev_count > cuda_max_device_count)
140 if (dev_id == -1) /* device already selected let's not destroy the context */
142 cu_err = cudaGetDevice(&id);
143 if (cu_err != cudaSuccess)
145 fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
146 cudaGetErrorString(cu_err));
153 if (id > dev_count - 1) /* pfff there's no such device */
155 fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
/* Step 2: fetch properties and reject pseudo-devices: capability 9999.9999
 * means "no device", major == 0 means emulation mode. */
161 memset(dev_prop, 0, sizeof(cudaDeviceProp));
162 cu_err = cudaGetDeviceProperties(dev_prop, id);
163 if (cu_err != cudaSuccess)
165 fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
166 cudaGetErrorString(cu_err));
170 /* both major & minor is 9999 if no CUDA capable devices are present */
171 if (dev_prop->major == 9999 && dev_prop->minor == 9999)
175 /* we don't care about emulation mode */
176 if (dev_prop->major == 0)
183 cu_err = cudaSetDevice(id);
184 if (cu_err != cudaSuccess)
186 fprintf(stderr, "Error %d while switching to device #%d: %s\n",
187 cu_err, id, cudaGetErrorString(cu_err));
/* Step 3: launch a trivial kernel and synchronize; if either fails the
 * device cannot execute code (busy, hung, or driver mismatch).
 * NOTE(review): cudaThreadSynchronize() is deprecated in favor of
 * cudaDeviceSynchronize() -- consider updating when touching this code. */
192 /* try to execute a dummy kernel */
193 k_dummy_test<<< 1, 512>>> ();
194 if (cudaThreadSynchronize() != cudaSuccess)
199 /* destroy context if we created one */
202 cu_err = cudaDeviceReset();
203 CU_RET_ERR(cu_err, "cudaDeviceReset failed");
/* Prints an informational message to stderr and, when fplog is set, to the
 * log file as well; the varargs are forwarded vfprintf-style via \p fmt. */
210 /* TODO: We should actually be using md_print_warn in md_logging.c,
211 * but we can't include mpi.h in CUDA code.
213 static void md_print_info(FILE *fplog,
214 const char *fmt, ...)
220 /* We should only print to stderr on the master node,
221 * in most cases fplog is only set on the master node, so this works.
224 vfprintf(stderr, fmt, ap);
228 vfprintf(fplog, fmt, ap);
/* Prints a warning to stderr and, when fplog is set, to the log file; unlike
 * md_print_info the message is set off with surrounding blank lines. */
234 /* TODO: We should actually be using md_print_warn in md_logging.c,
235 * but we can't include mpi.h in CUDA code.
236 * This is replicated from nbnxn_cuda_data_mgmt.cu.
238 static void md_print_warn(FILE *fplog,
239 const char *fmt, ...)
245 /* We should only print to stderr on the master node,
246 * in most cases fplog is only set on the master node, so this works.
249 fprintf(stderr, "\n");
250 vfprintf(stderr, fmt, ap);
251 fprintf(stderr, "\n");
255 fprintf(fplog, "\n");
256 vfprintf(fplog, fmt, ap);
257 fprintf(fplog, "\n");
262 #if HAVE_NVML_APPLICATION_CLOCKS
263 /*! \brief Determines and adds the NVML device ID to the passed \cuda_dev.
265 * Determines and adds the NVML device ID to the passed \cuda_dev. This is done by
266 * matching PCI-E information from \cuda_dev with the available NVML devices.
268 * \param[in,out] cuda_dev CUDA device information to enrich with NVML device info
269 * \returns true if \cuda_dev could be enriched with matching NVML device information.
271 static bool addNVMLDeviceId(gmx_device_info_t* cuda_dev)
273 nvmlDevice_t nvml_device_id;
274 unsigned int nvml_device_count = 0;
275 nvmlReturn_t nvml_stat = nvmlDeviceGetCount ( &nvml_device_count );
276 cuda_dev->nvml_initialized = false;
277 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetCount failed" );
/* Walk all NVML-visible devices and match on the PCI (domain, bus, device)
 * triple against the CUDA device properties; the loop also stops as soon as
 * any NVML call reports an error. */
278 for (unsigned int nvml_device_idx = 0; nvml_stat == NVML_SUCCESS && nvml_device_idx < nvml_device_count; ++nvml_device_idx)
280 nvml_stat = nvmlDeviceGetHandleByIndex ( nvml_device_idx, &nvml_device_id );
281 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetHandleByIndex failed" );
282 if (nvml_stat != NVML_SUCCESS)
287 nvmlPciInfo_t nvml_pci_info;
288 nvml_stat = nvmlDeviceGetPciInfo ( nvml_device_id, &nvml_pci_info );
289 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetPciInfo failed" );
290 if (nvml_stat != NVML_SUCCESS)
294 if (static_cast<unsigned int>(cuda_dev->prop.pciBusID) == nvml_pci_info.bus &&
295 static_cast<unsigned int>(cuda_dev->prop.pciDeviceID) == nvml_pci_info.device &&
296 static_cast<unsigned int>(cuda_dev->prop.pciDomainID) == nvml_pci_info.domain)
298 cuda_dev->nvml_initialized = true;
299 cuda_dev->nvml_device_id = nvml_device_id;
303 return cuda_dev->nvml_initialized;
305 #endif /* HAVE_NVML_APPLICATION_CLOCKS */
307 /*! \brief Tries to set application clocks for the GPU with the given index.
309 * The variable \gpuid is the index of the GPU in the gpu_info.cuda_dev array
310 * to handle the application clocks for. Application clocks are set to the
311 * max supported value to increase performance if application clock permissions
312 * allow this. For future GPU architectures a more sophisticated scheme might be
315 * \param[out] fplog log file to write to
316 * \param[in] gpuid index of the GPU to set application clocks for
317 * \param[in] gpu_info GPU info of all detected devices in the system.
318 * \returns true if no error occurs during application clocks handling.
320 static gmx_bool init_gpu_application_clocks(FILE gmx_unused *fplog, int gmx_unused gpuid, const gmx_gpu_info_t gmx_unused *gpu_info)
/* Only Tesla (CC >= 3.5) and Quadro (CC >= 5.2) boards expose settable
 * application clocks; everything else returns early. */
322 const cudaDeviceProp *prop = &gpu_info->gpu_dev[gpuid].prop;
323 int cuda_version_number = prop->major * 10 + prop->minor;
324 gmx_bool bGpuCanUseApplicationClocks =
325 ((0 == gmx_wcmatch("*Tesla*", prop->name) && cuda_version_number >= 35 ) ||
326 (0 == gmx_wcmatch("*Quadro*", prop->name) && cuda_version_number >= 52 ));
327 if (!bGpuCanUseApplicationClocks)
333 int cuda_runtime = 0;
334 cudaDriverGetVersion(&cuda_driver);
335 cudaRuntimeGetVersion(&cuda_runtime);
336 md_print_warn( fplog, "NOTE: GROMACS was configured without NVML support hence it can not exploit\n"
337 " application clocks of the detected %s GPU to improve performance.\n"
338 " Recompile with the NVML library (compatible with the driver used) or set application clocks manually.\n",
342 if (!bCompiledWithApplicationClockSupport)
345 int cuda_runtime = 0;
346 cudaDriverGetVersion(&cuda_driver);
347 cudaRuntimeGetVersion(&cuda_runtime);
348 md_print_warn( fplog, "NOTE: GROMACS was compiled with an old NVML library which does not support\n"
349 " managing application clocks of the detected %s GPU to improve performance.\n"
350 " If your GPU supports application clocks, upgrade NVML (and driver) and recompile or set the clocks manually.\n",
355 /* We've compiled with NVML application clocks support, and have a GPU that can use it */
356 nvmlReturn_t nvml_stat = NVML_SUCCESS;
358 //TODO: GMX_GPU_APPLICATION_CLOCKS is currently only used to enable/disable setting of application clocks
359 // this variable can be later used to give a user more fine grained control.
/* The env var acts as an opt-out: "0", "OFF" or "DISABLE" skip all clock
 * handling. */
360 env = getenv("GMX_GPU_APPLICATION_CLOCKS");
361 if (env != NULL && ( strcmp( env, "0") == 0 ||
362 gmx_strcasecmp( env, "OFF") == 0 ||
363 gmx_strcasecmp( env, "DISABLE") == 0 ))
367 nvml_stat = nvmlInit();
368 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlInit failed." );
369 if (nvml_stat != NVML_SUCCESS)
373 if (!addNVMLDeviceId( &(gpu_info->gpu_dev[gpuid])))
377 //get current application clocks setting
378 unsigned int app_sm_clock = 0;
379 unsigned int app_mem_clock = 0;
380 nvml_stat = nvmlDeviceGetApplicationsClock ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_SM, &app_sm_clock );
381 if (NVML_ERROR_NOT_SUPPORTED == nvml_stat)
385 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
386 nvml_stat = nvmlDeviceGetApplicationsClock ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_MEM, &app_mem_clock );
387 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
388 //get max application clocks
389 unsigned int max_sm_clock = 0;
390 unsigned int max_mem_clock = 0;
391 nvml_stat = nvmlDeviceGetMaxClockInfo ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_SM, &max_sm_clock );
392 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
393 nvml_stat = nvmlDeviceGetMaxClockInfo ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock );
394 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
/* Default to "restricted / unchanged" so the reset path is a no-op unless we
 * actually managed to raise the clocks below. */
396 gpu_info->gpu_dev[gpuid].nvml_is_restricted = NVML_FEATURE_ENABLED;
397 gpu_info->gpu_dev[gpuid].nvml_ap_clocks_changed = false;
399 nvml_stat = nvmlDeviceGetAPIRestriction ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(gpu_info->gpu_dev[gpuid].nvml_is_restricted) );
400 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetAPIRestriction failed" );
402 /* Note: Distinguishing between different types of GPUs here might be necessary in the future,
403 e.g. if max application clocks should not be used for certain GPUs. */
404 if (nvml_stat == NVML_SUCCESS && app_sm_clock < max_sm_clock && gpu_info->gpu_dev[gpuid].nvml_is_restricted == NVML_FEATURE_DISABLED)
406 md_print_info( fplog, "Changing GPU application clocks for %s to (%d,%d)\n", gpu_info->gpu_dev[gpuid].prop.name, max_mem_clock, max_sm_clock);
407 nvml_stat = nvmlDeviceSetApplicationsClocks ( gpu_info->gpu_dev[gpuid].nvml_device_id, max_mem_clock, max_sm_clock );
/* NOTE(review): the message below says "Get" but this guards
 * nvmlDeviceSetApplicationsClocks -- message text looks wrong. */
408 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
409 gpu_info->gpu_dev[gpuid].nvml_ap_clocks_changed = true;
411 else if (nvml_stat == NVML_SUCCESS && app_sm_clock < max_sm_clock)
413 md_print_warn( fplog, "Can not change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clocks.\n", gpu_info->gpu_dev[gpuid].prop.name, app_mem_clock, app_sm_clock, max_mem_clock, max_sm_clock);
415 else if (nvml_stat == NVML_SUCCESS && app_sm_clock == max_sm_clock)
417 //TODO: This should probably be integrated into the GPU Properties table.
418 md_print_info( fplog, "Application clocks (GPU clocks) for %s are (%d,%d)\n", gpu_info->gpu_dev[gpuid].prop.name, app_mem_clock, app_sm_clock);
422 md_print_warn( fplog, "Can not change GPU application clocks to optimal values due to NVML error (%d): %s.\n", nvml_stat, nvmlErrorString(nvml_stat));
424 return (nvml_stat == NVML_SUCCESS);
425 #endif /* HAVE_NVML */
428 /*! \brief Resets application clocks if changed and cleans up NVML for the passed \gpu_dev.
430 * \param[in] gpu_dev CUDA device information
432 static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused * cuda_dev)
434 #if !HAVE_NVML_APPLICATION_CLOCKS
435 GMX_UNUSED_VALUE(cuda_dev);
437 #else /* HAVE_NVML_APPLICATION_CLOCKS */
438 nvmlReturn_t nvml_stat = NVML_SUCCESS;
/* Only reset the clocks when this run actually changed them and had
 * permission to do so; otherwise just shut NVML down. */
440 cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED &&
441 cuda_dev->nvml_ap_clocks_changed)
443 nvml_stat = nvmlDeviceResetApplicationsClocks( cuda_dev->nvml_device_id );
444 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceResetApplicationsClocks failed" );
446 nvml_stat = nvmlShutdown();
447 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlShutdown failed" );
448 return (nvml_stat == NVML_SUCCESS);
449 #endif /* HAVE_NVML_APPLICATION_CLOCKS */
/* Binds the calling thread to the GPU at position \p mygpu in the list of
 * GPUs selected for use, writes the CUDA status string into \p result_str,
 * and (best-effort) tries to raise the application clocks. Returns TRUE on
 * success. */
452 gmx_bool init_gpu(FILE gmx_unused *fplog, int mygpu, char *result_str,
453 const struct gmx_gpu_info_t *gpu_info,
454 const struct gmx_gpu_opt_t *gpu_opt)
/* Reject indices outside the set of user/auto-selected GPUs. */
463 if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use)
465 sprintf(sbuf, "Trying to initialize an inexistent GPU: "
466 "there are %d %s-selected GPU(s), but #%d was requested.",
467 gpu_opt->n_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
471 gpuid = gpu_info->gpu_dev[gpu_opt->dev_use[mygpu]].id;
473 stat = cudaSetDevice(gpuid);
474 strncpy(result_str, cudaGetErrorString(stat), STRLEN);
478 fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->gpu_dev[gpuid].prop.name);
481 //Ignoring return value as NVML errors should be treated not critical.
482 if (stat == cudaSuccess)
484 init_gpu_application_clocks(fplog, gpuid, gpu_info);
486 return (stat == cudaSuccess);
/* Tears down the CUDA context on the GPU used by this rank: resets the
 * application clocks if we changed them, then calls cudaDeviceReset().
 * The CUDA status string is written into \p result_str; returns TRUE only
 * if both the reset and the clock restoration succeeded. */
489 gmx_bool free_cuda_gpu(
490 int gmx_unused mygpu, char *result_str,
491 const gmx_gpu_info_t gmx_unused *gpu_info,
492 const gmx_gpu_opt_t gmx_unused *gpu_opt
496 gmx_bool reset_gpu_application_clocks_status = true;
504 stat = cudaGetDevice(&gpuid);
505 CU_RET_ERR(stat, "cudaGetDevice failed");
506 fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
/* Map the use-list index back to a detected-device index; -1 when no
 * selection information is available. */
509 gpuid = gpu_opt ? gpu_opt->dev_use[mygpu] : -1;
512 reset_gpu_application_clocks_status = reset_gpu_application_clocks( &(gpu_info->gpu_dev[gpuid]) );
515 stat = cudaDeviceReset();
516 strncpy(result_str, cudaGetErrorString(stat), STRLEN);
517 return (stat == cudaSuccess) && reset_gpu_application_clocks_status;
/*! \brief Reports whether a GPU, characterized by its device properties,
 * is supported by the native GPU acceleration.
 *
 * \param[in] dev_prop  the CUDA device properties of the GPU to test.
 * \returns             true if the device is compatible, otherwise false.
 */
static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
{
    /* The native kernels require compute capability 2.0 (Fermi) or newer. */
    return (dev_prop->major >= 2);
}
/*! \brief Tells whether a GPU detection status value means "compatible".
 *
 * \param[in] stat  GPU status code.
 * \returns         true if and only if \p stat equals egpuCompatible.
 */
static bool is_compatible_gpu(int stat)
{
    return stat == egpuCompatible;
}
542 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
544 * Returns a status value which indicates compatibility or one of the following
545 * errors: incompatibility, inexistence, or insanity (=unexpected behavior).
546 * It also returns the respective device's properties in \dev_prop (if applicable).
548 * \param[in] dev_id the ID of the GPU to check.
549 * \param[out] dev_prop the CUDA device properties of the device checked.
550 * \returns the status of the requested device
552 static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
/* If we cannot even count devices, classify as "insane" rather than guess. */
557 stat = cudaGetDeviceCount(&ndev);
558 if (stat != cudaSuccess)
563 if (dev_id > ndev - 1)
565 return egpuNonexistent;
568 /* TODO: currently we do not make a distinction between the type of errors
569 * that can appear during sanity checks. This needs to be improved, e.g if
570 * the dummy test kernel fails to execute with a "device busy message" we
571 * should appropriately report that the device is busy instead of insane.
573 if (do_sanity_checks(dev_id, dev_prop) == 0)
575 if (is_gmx_supported_gpu(dev_prop))
577 return egpuCompatible;
581 return egpuIncompatible;
/* Detects all CUDA devices, records each one's properties and status in a
 * freshly allocated gpu_dev array inside \p gpu_info, and counts compatible
 * devices. On failure, the CUDA error string is copied into \p err_str. */
591 int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
593 int i, ndev, checkres, retval;
596 gmx_device_info_t *devs;
601 gpu_info->n_dev_compatible = 0;
606 stat = cudaGetDeviceCount(&ndev);
607 if (stat != cudaSuccess)
611 /* cudaGetDeviceCount failed which means that there is something
612 * wrong with the machine: driver-runtime mismatch, all GPUs being
613 * busy in exclusive mode, or some other condition which should
614 * result in us issuing a warning and falling back to CPUs. */
616 s = cudaGetErrorString(stat);
617 strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
/* Run the full sanity check on every device and record its status. */
622 for (i = 0; i < ndev; i++)
624 checkres = is_gmx_supported_gpu_id(i, &prop);
628 devs[i].stat = checkres;
630 if (checkres == egpuCompatible)
632 gpu_info->n_dev_compatible++;
638 gpu_info->n_dev = ndev;
639 gpu_info->gpu_dev = devs;
/* Collects the indices of all compatible detected GPUs into the
 * gpu_opt->dev_compatible array and records their count. */
644 void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info,
645 gmx_gpu_opt_t *gpu_opt)
651 /* gpu_dev/n_dev have to be either NULL/0 or not (NULL/0) */
652 assert((gpu_info->n_dev != 0 ? 0 : 1) ^ (gpu_info->gpu_dev == NULL ? 0 : 1))
;
654 snew(compat, gpu_info->n_dev);
656 for (i = 0; i < gpu_info->n_dev; i++)
658 if (is_compatible_gpu(gpu_info->gpu_dev[i].stat))
661 compat[ncompat - 1] = i;
/* Shrink-copy the scratch list into the permanently-owned array. */
665 gpu_opt->n_dev_compatible = ncompat;
666 snew(gpu_opt->dev_compatible, ncompat);
667 memcpy(gpu_opt->dev_compatible, compat, ncompat*sizeof(*compat));
/* Verifies the user-selected GPU IDs in gpu_opt->dev_use against the
 * detection results, writing a per-ID status into \p checkres; returns TRUE
 * only if every requested ID exists and is compatible. */
671 gmx_bool check_selected_gpus(int *checkres,
672 const gmx_gpu_info_t *gpu_info,
673 gmx_gpu_opt_t *gpu_opt)
680 assert(gpu_opt->n_dev_use >= 0);
682 if (gpu_opt->n_dev_use == 0)
687 assert(gpu_opt->dev_use);
689 /* we will assume that all GPUs requested are valid IDs,
690 otherwise we'll bail anyways */
693 for (i = 0; i < gpu_opt->n_dev_use; i++)
695 id = gpu_opt->dev_use[i];
697 /* devices are stored in increasing order of IDs in gpu_dev */
698 gpu_opt->dev_use[i] = id;
/* Out-of-range IDs map to egpuNonexistent; otherwise report the detected
 * status of the requested device. */
700 checkres[i] = (id >= gpu_info->n_dev) ?
701 egpuNonexistent : gpu_info->gpu_dev[id].stat;
703 bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
/*! \brief Frees the GPU device array owned by \p gpu_info.
 *
 * Safe to call with NULL; only the gpu_dev array is released, the struct
 * itself is left to the caller.
 *
 * \param[in] gpu_info  detection results whose device array is freed.
 */
void free_gpu_info(const gmx_gpu_info_t *gpu_info)
{
    if (gpu_info == NULL)
    {
        return;
    }

    sfree(gpu_info->gpu_dev);
}
/*! \brief Formats a one-line description of the GPU at \p index into \p s.
 *
 * The string contains the device ID, name, compute capability, ECC state,
 * and detection status; for devices whose properties could not be queried
 * the name is reported as "N/A".
 *
 * \param[out] s         output buffer (allocated by the caller)
 * \param[in]  gpu_info  detection results to read from
 * \param[in]  index     index *directly* into the array of detected GPUs
 */
void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index)
{
    assert(s);
    assert(gpu_info);

    /* Bug fix: the original test used &&, which is always false, so an
     * out-of-range index was never caught and gpu_dev[index] could be read
     * out of bounds. */
    if (index < 0 || index >= gpu_info->n_dev)
    {
        return;
    }

    gmx_device_info_t *dinfo = &gpu_info->gpu_dev[index];

    /* Properties are only valid when detection classified the device as
     * (in)compatible; any other status means they could not be queried. */
    bool               bGpuExists =
        dinfo->stat == egpuCompatible ||
        dinfo->stat == egpuIncompatible;

    if (!bGpuExists)
    {
        sprintf(s, "#%d: %s, stat: %s",
                dinfo->id, "N/A",
                gpu_detect_res_str[dinfo->stat]);
    }
    else
    {
        sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
                dinfo->id, dinfo->prop.name,
                dinfo->prop.major, dinfo->prop.minor,
                dinfo->prop.ECCEnabled ? "yes" : " no",
                gpu_detect_res_str[dinfo->stat]);
    }
}
/*! \brief Returns the CUDA device ID of the GPU at position \p idx in the
 * list of GPUs selected for use.
 *
 * \param[in] gpu_info  information about all detected devices
 * \param[in] gpu_opt   GPU selection (dev_use mapping into gpu_info)
 * \param[in] idx       index into the array of used GPUs
 * \returns             the device ID of the requested GPU
 */
int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
                      const gmx_gpu_opt_t  *gpu_opt,
                      int                   idx)
{
    assert(gpu_info);
    assert(gpu_opt);
    assert(idx >= 0 && idx < gpu_opt->n_dev_use);

    return gpu_info->gpu_dev[gpu_opt->dev_use[idx]].id;
}
/*! \brief Returns the device ID of the CUDA GPU currently bound to the
 * calling host thread.
 *
 * \returns the device ID of the active CUDA device.
 */
int get_current_cuda_gpu_device_id(void)
{
    int gpuid;
    CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");

    return gpuid;
}
/*! \brief Returns the size in bytes of gmx_device_info_t.
 *
 * Lets translation units that cannot see the struct definition (non-CUDA
 * code) allocate storage for it.
 */
size_t sizeof_gpu_dev_info(void)
{
    return sizeof(gmx_device_info_t);
}
/* Selects the host-memory allocation/free routines to be used by the
 * nonbonded code depending on whether GPU kernels are in use. */
775 void gpu_set_host_malloc_and_free(bool bUseGpuKernels,
776 gmx_host_alloc_t **nb_alloc,
777 gmx_host_free_t **nb_free)
/* pmalloc is selected for the GPU path -- presumably a page-locked (pinned)
 * allocator for faster host-device transfers; TODO confirm against
 * pmalloc_cuda.h. The non-GPU else branch is elided here. */
781 *nb_alloc = &pmalloc;
/*! \brief Starts the CUDA profiler when mdrun runs under nvprof.
 *
 * nvprof sets the NVPROF_ID environment variable (captured at startup in
 * cudaProfilerRun). When nvprof was launched with "--profile-from-start off",
 * the trace begins here instead, so the CUDA activity of the first part of
 * the run stays out of the profile. Starting an already-started profiler is
 * a harmless no-op.
 */
void startGpuProfiler(void)
{
    if (cudaProfilerRun)
    {
        cudaError_t stat = cudaProfilerStart();
        CU_RET_ERR(stat, "cudaProfilerStart failed");
    }
}
/*! \brief Stops the CUDA profiler when mdrun runs under nvprof.
 *
 * Stopping here keeps the subsequent API calls (uninitialization and
 * cleanup) out of the recorded trace. Only acts when the run is profiled.
 */
void stopGpuProfiler(void)
{
    if (cudaProfilerRun)
    {
        cudaError_t stat = cudaProfilerStop();
        CU_RET_ERR(stat, "cudaProfilerStop failed");
    }
}
819 void resetGpuProfiler(void)
821 /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
822 * the profiling here (can't stop it) which will achieve the desired effect if
823 * the run was started with the profiling disabled.
825 * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.