2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * \brief Define functions for detection and initialization of CUDA devices.
38 * \author Szilard Pall <pall.szilard@gmail.com>
43 #include "gpu_utils.h"
51 #include <cuda_profiler_api.h>
53 #include "gromacs/gpu_utils/cudautils.cuh"
54 #include "gromacs/gpu_utils/pmalloc_cuda.h"
55 #include "gromacs/hardware/gpu_hw_info.h"
56 #include "gromacs/utility/basedefinitions.h"
57 #include "gromacs/utility/cstringutil.h"
58 #include "gromacs/utility/logger.h"
59 #include "gromacs/utility/smalloc.h"
/* Compile-time capability flag: NVML application-clock queries/settings need
 * NVML API version >= 6.
 * NOTE(review): the #if/#else head guarding these two alternative definitions
 * (presumably testing HAVE_NVML) is elided in this excerpt - confirm against
 * the full file. */
63 #define HAVE_NVML_APPLICATION_CLOCKS (NVML_API_VERSION >= 6)
65 #define HAVE_NVML_APPLICATION_CLOCKS 0
66 #endif /* HAVE_NVML */
68 #if defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS
69 /*! Check for NVML error on the return status of a NVML API call; emits a gmx_warning on failure. */
70 # define HANDLE_NVML_RET_ERR(status, msg) \
72 if (status != NVML_SUCCESS) \
74 gmx_warning("%s: %s\n", msg, nvmlErrorString(status)); \
77 #else /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
/* No-op variant when error checking or NVML application clocks are unavailable. */
78 # define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
79 #endif /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
81 #if HAVE_NVML_APPLICATION_CLOCKS
/* Compile-time constant used at runtime to report whether application-clock
 * support was built in (see init_gpu_application_clocks). */
82 static const gmx_bool bCompiledWithApplicationClockSupport = true;
/* NOTE(review): the #else separating the true/false definitions is elided in
 * this excerpt - confirm against the full file. */
84 static const gmx_bool gmx_unused bCompiledWithApplicationClockSupport = false;
88 * Max number of devices supported by CUDA (for consistency checking).
90 * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
92 static int cuda_max_device_count = 32;
/* True when mdrun was started under nvprof (which sets NVPROF_ID); used to
 * decide whether the CUDA profiler start/stop API should be driven from here. */
94 static bool cudaProfilerRun = ((getenv("NVPROF_ID") != NULL));
96 /** Dummy kernel used for sanity checking; launched by do_sanity_checks() to
    verify that kernels can be executed on the selected device. */
97 __global__ void k_dummy_test()
/* NOTE(review): the kernel body is elided in this excerpt (expected to be empty). */
103 * \brief Runs GPU sanity checks.
105 * Runs a series of checks to determine that the given GPU and underlying CUDA
106 * driver/runtime functions properly.
107 * Returns properties of a device with given ID or the one that has
108 * already been initialized earlier in the case of \dev_id == -1.
110 * \param[in] dev_id the device ID of the GPU or -1 if the device has already been initialized
111 * \param[out] dev_prop pointer to the structure in which the device properties will be returned
112 * \returns 0 if the device looks OK
114 * TODO: introduce error codes and handle errors more smoothly.
116 static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
/* NOTE(review): this excerpt elides several lines (local declarations of
 * cu_err, dev_count and id, braces, and the early-return statements), so the
 * comments below describe only the visible statements. */
121 cu_err = cudaGetDeviceCount(&dev_count);
122 if (cu_err != cudaSuccess)
124 fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
125 cudaGetErrorString(cu_err));
129 /* no CUDA compatible device at all */
135 /* things might go horribly wrong if cudart is not compatible with the driver */
136 if (dev_count < 0 || dev_count > cuda_max_device_count)
141 if (dev_id == -1) /* device already selected let's not destroy the context */
143 cu_err = cudaGetDevice(&id);
144 if (cu_err != cudaSuccess)
146 fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
147 cudaGetErrorString(cu_err));
154 if (id > dev_count - 1) /* pfff there's no such device */
156 fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
162 memset(dev_prop, 0, sizeof(cudaDeviceProp));
163 cu_err = cudaGetDeviceProperties(dev_prop, id);
164 if (cu_err != cudaSuccess)
166 fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
167 cudaGetErrorString(cu_err));
171 /* both major & minor is 9999 if no CUDA capable devices are present */
172 if (dev_prop->major == 9999 && dev_prop->minor == 9999)
176 /* we don't care about emulation mode */
177 if (dev_prop->major == 0)
/* Make the checked device current so the dummy kernel runs on it. */
184 cu_err = cudaSetDevice(id);
185 if (cu_err != cudaSuccess)
187 fprintf(stderr, "Error %d while switching to device #%d: %s\n",
188 cu_err, id, cudaGetErrorString(cu_err));
193 /* try to execute a dummy kernel */
194 k_dummy_test<<< 1, 512>>> ();
/* NOTE(review): cudaThreadSynchronize() is deprecated in favor of
 * cudaDeviceSynchronize(); kept as-is here. */
195 if (cudaThreadSynchronize() != cudaSuccess)
200 /* destroy context if we created one */
203 cu_err = cudaDeviceReset();
204 CU_RET_ERR(cu_err, "cudaDeviceReset failed");
210 #if HAVE_NVML_APPLICATION_CLOCKS
211 /*! \brief Determines and adds the NVML device ID to the passed \cuda_dev.
213 * Determines and adds the NVML device ID to the passed \cuda_dev. This is done by
214 * matching PCI-E information from \cuda_dev with the available NVML devices.
216 * \param[in,out] cuda_dev CUDA device information to enrich with NVML device info
217 * \returns true if \cuda_dev could be enriched with matching NVML device information.
219 static bool addNVMLDeviceId(gmx_device_info_t* cuda_dev)
/* NOTE(review): braces and loop-exit statements are elided in this excerpt. */
221 nvmlDevice_t nvml_device_id;
222 unsigned int nvml_device_count = 0;
223 nvmlReturn_t nvml_stat = nvmlDeviceGetCount ( &nvml_device_count );
/* NOTE(review): despite its name, nvmlWasInitialized records whether a
 * matching NVML device was found, not whether NVML was initialized. */
224 bool nvmlWasInitialized = false;
225 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetCount failed" );
226 for (unsigned int nvml_device_idx = 0; nvml_stat == NVML_SUCCESS && nvml_device_idx < nvml_device_count; ++nvml_device_idx)
228 nvml_stat = nvmlDeviceGetHandleByIndex ( nvml_device_idx, &nvml_device_id );
229 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetHandleByIndex failed" );
230 if (nvml_stat != NVML_SUCCESS)
235 nvmlPciInfo_t nvml_pci_info;
236 nvml_stat = nvmlDeviceGetPciInfo ( nvml_device_id, &nvml_pci_info );
237 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetPciInfo failed" );
238 if (nvml_stat != NVML_SUCCESS)
/* Match the CUDA device against this NVML device by the full PCI
 * domain/bus/device triple. */
242 if (static_cast<unsigned int>(cuda_dev->prop.pciBusID) == nvml_pci_info.bus &&
243 static_cast<unsigned int>(cuda_dev->prop.pciDeviceID) == nvml_pci_info.device &&
244 static_cast<unsigned int>(cuda_dev->prop.pciDomainID) == nvml_pci_info.domain)
246 nvmlWasInitialized = true;
247 cuda_dev->nvml_device_id = nvml_device_id;
251 return nvmlWasInitialized;
254 /*! \brief Reads and returns the application clocks for device.
256 * \param[in] cuda_dev The GPU device
257 * \param[out] app_sm_clock The current application SM clock
258 * \param[out] app_mem_clock The current application memory clock
259 * \returns if application clocks are supported
261 static bool getApplicationClocks(const gmx_device_info_t *cuda_dev,
262 unsigned int *app_sm_clock,
263 unsigned int *app_mem_clock)
/* NOTE(review): braces and the return statements (early false on
 * NVML_ERROR_NOT_SUPPORTED, final return) are elided in this excerpt. */
265 nvmlReturn_t nvml_stat;
267 nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_SM, app_sm_clock);
/* Not-supported is an expected condition (e.g. GeForce boards), not an error. */
268 if (NVML_ERROR_NOT_SUPPORTED == nvml_stat)
272 HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed for NVIDIA_CLOCK_SM");
273 nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, app_mem_clock);
274 HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed for NVIDIA_CLOCK_MEM");
280 /*! \brief Tries to set application clocks for the GPU with the given index.
282 * The variable \gpuid is the index of the GPU in the gpu_info.cuda_dev array
283 * to handle the application clocks for. Application clocks are set to the
284 * max supported value to increase performance if application clock permissions
285 * allow this. For future GPU architectures a more sophisticated scheme might be
288 * \todo Refactor this into a detection phase and a work phase. Also
289 * refactor to remove compile-time dependence on logging header.
291 * \param mdlog log file to write to
292 * \param[in] gpuid index of the GPU to set application clocks for
293 * \param[in] gpu_info GPU info of all detected devices in the system.
294 * \returns true if no error occurs during application clocks handling.
296 static gmx_bool init_gpu_application_clocks(
297 const gmx::MDLogger &mdlog, int gmx_unused gpuid,
298 const gmx_gpu_info_t gmx_unused *gpu_info)
/* NOTE(review): this excerpt elides many lines (braces, returns, the env
 * declaration, and the #if HAVE_NVML / HAVE_NVML_APPLICATION_CLOCKS
 * preprocessor branches that select between the message-only paths and the
 * real clock-setting path below). */
300 const cudaDeviceProp *prop = &gpu_info->gpu_dev[gpuid].prop;
301 int cuda_compute_capability = prop->major * 10 + prop->minor;
/* Only Tesla (CC >= 3.5) and Quadro (CC >= 5.2) boards expose settable
 * application clocks. */
302 gmx_bool bGpuCanUseApplicationClocks =
303 ((0 == gmx_wcmatch("*Tesla*", prop->name) && cuda_compute_capability >= 35 ) ||
304 (0 == gmx_wcmatch("*Quadro*", prop->name) && cuda_compute_capability >= 52 ));
305 if (!bGpuCanUseApplicationClocks)
310 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
311 "NOTE: GROMACS was configured without NVML support hence it can not exploit\n"
312 " application clocks of the detected %s GPU to improve performance.\n"
313 " Recompile with the NVML library (compatible with the driver used) or set application clocks manually.",
317 if (!bCompiledWithApplicationClockSupport)
319 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
320 "NOTE: GROMACS was compiled with an old NVML library which does not support\n"
321 " managing application clocks of the detected %s GPU to improve performance.\n"
322 " If your GPU supports application clocks, upgrade NVML (and driver) and recompile or set the clocks manually.",
327 /* We've compiled with NVML application clocks support, and have a GPU that can use it */
328 nvmlReturn_t nvml_stat = NVML_SUCCESS;
330 //TODO: GMX_GPU_APPLICATION_CLOCKS is currently only used to enable/disable setting of application clocks
331 // this variable can be later used to give a user more fine grained control.
/* Honor the user's opt-out: "0"/"OFF"/"DISABLE" skips clock handling. */
332 env = getenv("GMX_GPU_APPLICATION_CLOCKS");
333 if (env != NULL && ( strcmp( env, "0") == 0 ||
334 gmx_strcasecmp( env, "OFF") == 0 ||
335 gmx_strcasecmp( env, "DISABLE") == 0 ))
/* NVML must be initialized before any query/set calls below. */
339 nvml_stat = nvmlInit();
340 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlInit failed." );
341 if (nvml_stat != NVML_SUCCESS)
346 gmx_device_info_t *cuda_dev = &(gpu_info->gpu_dev[gpuid]);
348 if (!addNVMLDeviceId(cuda_dev))
352 //get current application clocks setting
353 if (!getApplicationClocks(cuda_dev,
354 &cuda_dev->nvml_orig_app_sm_clock,
355 &cuda_dev->nvml_orig_app_mem_clock))
359 //get max application clocks
360 unsigned int max_sm_clock = 0;
361 unsigned int max_mem_clock = 0;
362 nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_SM, &max_sm_clock);
363 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
364 nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock);
365 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
367 cuda_dev->nvml_is_restricted = NVML_FEATURE_ENABLED;
368 cuda_dev->nvml_app_clocks_changed = false;
/* Clocks already at (or above) the max: nothing to change, just report them. */
370 if (cuda_dev->nvml_orig_app_sm_clock >= max_sm_clock)
372 //TODO: This should probably be integrated into the GPU Properties table.
373 GMX_LOG(mdlog.info).appendTextFormatted(
374 "Application clocks (GPU clocks) for %s are (%d,%d)",
375 cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
379 if (cuda_compute_capability >= 60)
381 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
382 "Cannot change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nPlease contact your admin to change application clocks.\n",
383 cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);
/* Check whether setting application clocks requires elevated permissions. */
387 nvml_stat = nvmlDeviceGetAPIRestriction(cuda_dev->nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(cuda_dev->nvml_is_restricted));
388 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetAPIRestriction failed" );
390 if (nvml_stat != NVML_SUCCESS)
392 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
393 "Cannot change GPU application clocks to optimal values due to NVML error (%d): %s.",
394 nvml_stat, nvmlErrorString(nvml_stat));
398 if (cuda_dev->nvml_is_restricted != NVML_FEATURE_DISABLED)
400 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
401 "Cannot change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clocks.",
402 cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);
406 /* Note: Distinguishing between different types of GPUs here might be necessary in the future,
407 e.g. if max application clocks should not be used for certain GPUs. */
408 GMX_LOG(mdlog.warning).appendTextFormatted(
409 "Changing GPU application clocks for %s to (%d,%d)",
410 cuda_dev->prop.name, max_mem_clock, max_sm_clock);
/* Record what we set so reset_gpu_application_clocks() can restore originals. */
411 nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, max_mem_clock, max_sm_clock);
/* NOTE(review): the message below says "Get" but the call above is
 * nvmlDeviceSetApplicationsClocks - the string should be corrected. */
412 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
413 cuda_dev->nvml_app_clocks_changed = true;
414 cuda_dev->nvml_set_app_sm_clock = max_sm_clock;
415 cuda_dev->nvml_set_app_mem_clock = max_mem_clock;
418 #endif /* HAVE_NVML */
421 /*! \brief Resets application clocks if changed and cleans up NVML for the passed \cuda_dev.
423 * \param[in] cuda_dev CUDA device information
425 static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused * cuda_dev)
/* NOTE(review): braces, the early return of the no-NVML branch, and the head
 * of the if-statement at old line 433 are elided in this excerpt. */
427 #if !HAVE_NVML_APPLICATION_CLOCKS
428 GMX_UNUSED_VALUE(cuda_dev);
430 #else /* HAVE_NVML_APPLICATION_CLOCKS */
431 nvmlReturn_t nvml_stat = NVML_SUCCESS;
/* Only restore clocks if we were allowed to change them and actually did. */
433 cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED &&
434 cuda_dev->nvml_app_clocks_changed)
436 /* Check if the clocks are still what we set them to.
437 * If so, set them back to the state we originally found them in.
438 * If not, don't touch them, because something else set them later.
440 unsigned int app_sm_clock, app_mem_clock;
/* NOTE(review): the boolean result of getApplicationClocks() is ignored here;
 * on failure app_sm_clock/app_mem_clock may be left unset - verify. */
441 getApplicationClocks(cuda_dev, &app_sm_clock, &app_mem_clock);
442 if (app_sm_clock == cuda_dev->nvml_set_app_sm_clock &&
443 app_mem_clock == cuda_dev->nvml_set_app_mem_clock)
445 nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
446 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceSetApplicationsClock failed" );
/* Shut down NVML; it was initialized in init_gpu_application_clocks(). */
449 nvml_stat = nvmlShutdown();
450 HANDLE_NVML_RET_ERR( nvml_stat, "nvmlShutdown failed" );
451 return (nvml_stat == NVML_SUCCESS);
452 #endif /* HAVE_NVML_APPLICATION_CLOCKS */
455 gmx_bool init_gpu(const gmx::MDLogger &mdlog, int mygpu, char *result_str,
456 const struct gmx_gpu_info_t *gpu_info,
457 const struct gmx_gpu_opt_t *gpu_opt)
/* Selects and initializes the mygpu-th GPU of the user-selected set, stores a
 * human-readable status in result_str and tries to set application clocks.
 * NOTE(review): local declarations (stat, gpuid, sbuf), asserts, braces and a
 * gmx_incons/return path are elided in this excerpt. */
466 if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use)
468 sprintf(sbuf, "Trying to initialize an non-existent GPU: "
469 "there are %d selected GPU(s), but #%d was requested.",
470 gpu_opt->n_dev_use, mygpu);
/* Map the index within the user-selected set to the physical CUDA device id. */
474 gpuid = gpu_info->gpu_dev[gpu_opt->dev_use[mygpu]].id;
476 stat = cudaSetDevice(gpuid);
/* NOTE(review): strncpy does not guarantee NUL termination when the message
 * length reaches STRLEN - callers must not rely on termination here. */
477 strncpy(result_str, cudaGetErrorString(stat), STRLEN);
481 fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->gpu_dev[gpuid].prop.name);
484 //Ignoring return value as NVML errors should be treated not critical.
485 if (stat == cudaSuccess)
487 init_gpu_application_clocks(mdlog, gpuid, gpu_info);
489 return (stat == cudaSuccess);
492 gmx_bool free_cuda_gpu(
493 int gmx_unused mygpu, char *result_str,
494 const gmx_gpu_info_t gmx_unused *gpu_info,
495 const gmx_gpu_opt_t gmx_unused *gpu_opt
/* Resets application clocks (if they were changed) and destroys the CUDA
 * context on the current device; stores a status string in result_str.
 * NOTE(review): local declarations (stat, gpuid), braces and the debug-only
 * guards around the fprintf are elided in this excerpt. */
499 gmx_bool reset_gpu_application_clocks_status = true;
507 stat = cudaGetDevice(&gpuid);
508 CU_RET_ERR(stat, "cudaGetDevice failed");
509 fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
512 gpuid = gpu_opt ? gpu_opt->dev_use[mygpu] : -1;
515 reset_gpu_application_clocks_status = reset_gpu_application_clocks( &(gpu_info->gpu_dev[gpuid]) );
/* cudaDeviceReset destroys the context created on this device. */
518 stat = cudaDeviceReset();
519 strncpy(result_str, cudaGetErrorString(stat), STRLEN);
520 return (stat == cudaSuccess) && reset_gpu_application_clocks_status;
523 /*! \brief Returns true if the gpu characterized by the device properties is
524 * supported by the native gpu acceleration.
526 * \param[in] dev_prop the CUDA device properties of the gpus to test.
527 * \returns true if the GPU properties passed indicate a compatible
528 * GPU, otherwise false.
530 static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
/* Compatibility test: compute capability major version >= 2 (Fermi or later). */
532 return (dev_prop->major >= 2);
535 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
537 * Returns a status value which indicates compatibility or one of the following
538 * errors: incompatibility, non-existence, or insanity (=unexpected behavior).
539 * It also returns the respective device's properties in \dev_prop (if applicable).
541 * \param[in] dev_id the ID of the GPU to check.
542 * \param[out] dev_prop the CUDA device properties of the device checked.
543 * \returns the status of the requested device
545 static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
/* NOTE(review): local declarations (stat, ndev), braces and some return
 * statements (e.g. for the cudaGetDeviceCount failure and the final
 * insane/incompatible paths) are elided in this excerpt. */
550 stat = cudaGetDeviceCount(&ndev);
551 if (stat != cudaSuccess)
556 if (dev_id > ndev - 1)
558 return egpuNonexistent;
561 /* TODO: currently we do not make a distinction between the type of errors
562 * that can appear during sanity checks. This needs to be improved, e.g if
563 * the dummy test kernel fails to execute with a "device busy message" we
564 * should appropriately report that the device is busy instead of insane.
/* Sanity check returns 0 on success (see do_sanity_checks). */
566 if (do_sanity_checks(dev_id, dev_prop) == 0)
568 if (is_gmx_supported_gpu(dev_prop))
570 return egpuCompatible;
574 return egpuIncompatible;
584 int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
/* Detects all CUDA devices, records each one's compatibility status in a newly
 * allocated gmx_device_info_t array and fills gpu_info.
 * NOTE(review): local declarations (stat, prop, s), asserts, braces, the
 * snew(devs, ndev) allocation, per-device property/id assignment and the
 * return statements are elided in this excerpt. */
586 int i, ndev, checkres, retval;
589 gmx_device_info_t *devs;
594 gpu_info->n_dev_compatible = 0;
599 stat = cudaGetDeviceCount(&ndev);
600 if (stat != cudaSuccess)
604 /* cudaGetDeviceCount failed which means that there is something
605 * wrong with the machine: driver-runtime mismatch, all GPUs being
606 * busy in exclusive mode, or some other condition which should
607 * result in us issuing a warning and falling back to CPUs. */
609 s = cudaGetErrorString(stat);
/* NOTE(review): strncpy may leave err_str unterminated if the message is
 * STRLEN characters or longer - verify callers tolerate this. */
610 strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
/* Run the sanity/compatibility check on every detected device. */
615 for (i = 0; i < ndev; i++)
617 checkres = is_gmx_supported_gpu_id(i, &prop);
621 devs[i].stat = checkres;
623 if (checkres == egpuCompatible)
625 gpu_info->n_dev_compatible++;
631 gpu_info->n_dev = ndev;
632 gpu_info->gpu_dev = devs;
/* Returns whether the device at \c index was detected as compatible; an
 * out-of-range index reports incompatible rather than failing.
 * NOTE(review): the remaining parameter line(s) of the signature (the index
 * parameter), asserts and braces are elided in this excerpt. */
637 bool isGpuCompatible(const gmx_gpu_info_t *gpu_info,
642 return (index >= gpu_info->n_dev ?
644 gpu_info->gpu_dev[index].stat == egpuCompatible);
/* Returns a human-readable description string for the detection status of the
 * device at \c index; an out-of-range index maps to the "nonexistent" string.
 * NOTE(review): the remaining parameter line(s) of the signature, asserts and
 * braces are elided in this excerpt. */
647 const char *getGpuCompatibilityDescription(const gmx_gpu_info_t *gpu_info,
652 return (index >= gpu_info->n_dev ?
653 gpu_detect_res_str[egpuNonexistent] :
654 gpu_detect_res_str[gpu_info->gpu_dev[index].stat]);
/* Frees the device array owned by gpu_info; a NULL gpu_info is a no-op.
 * NOTE(review): braces and the early return after the NULL check are elided
 * in this excerpt. */
657 void free_gpu_info(const gmx_gpu_info_t *gpu_info)
659 if (gpu_info == NULL)
664 sfree(gpu_info->gpu_dev);
667 void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index)
/* Formats a one-line description of the device at \c index into s.
 * NOTE(review): asserts, braces, the early return and the if/else that selects
 * between the two sprintf branches are elided in this excerpt. */
672 if (index < 0 && index >= gpu_info->n_dev)
/* NOTE(review): the condition above can never be true; it was almost
 * certainly meant to be (index < 0 || index >= gpu_info->n_dev) - the
 * bounds check is currently ineffective. */
677 gmx_device_info_t *dinfo = &gpu_info->gpu_dev[index];
/* Continuation of an elided boolean initialization (device "exists" when its
 * status is compatible or incompatible, i.e. it was actually probed). */
680 dinfo->stat == egpuCompatible ||
681 dinfo->stat == egpuIncompatible;
685 sprintf(s, "#%d: %s, stat: %s",
687 gpu_detect_res_str[dinfo->stat]);
691 sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
692 dinfo->id, dinfo->prop.name,
693 dinfo->prop.major, dinfo->prop.minor,
694 dinfo->prop.ECCEnabled ? "yes" : " no",
695 gpu_detect_res_str[dinfo->stat]);
/* Returns the physical CUDA device id of the idx-th device in the
 * user-selected set (gpu_opt->dev_use).
 * NOTE(review): the remaining signature line(s), asserts on gpu_info/gpu_opt
 * and braces are elided in this excerpt. */
699 int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
700 const gmx_gpu_opt_t *gpu_opt,
705 assert(idx >= 0 && idx < gpu_opt->n_dev_use);
707 return gpu_info->gpu_dev[gpu_opt->dev_use[idx]].id;
/* Returns the id of the CUDA device currently bound to the calling thread.
 * NOTE(review): the gpuid declaration, braces and the return statement are
 * elided in this excerpt. */
710 int get_current_cuda_gpu_device_id(void)
713 CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
/* Returns the size of gmx_device_info_t so callers that treat it as opaque
 * (e.g. for MPI transfers or allocation) can size buffers correctly. */
718 size_t sizeof_gpu_dev_info(void)
720 return sizeof(gmx_device_info_t);
/* Selects host-memory allocation/free routines: with GPU kernels in use,
 * page-locked (pinned) allocation via pmalloc is installed for faster
 * host<->device transfers.
 * NOTE(review): braces, the assignment of *nb_free and the non-GPU else
 * branch are elided in this excerpt. */
723 void gpu_set_host_malloc_and_free(bool bUseGpuKernels,
724 gmx_host_alloc_t **nb_alloc,
725 gmx_host_free_t **nb_free)
729 *nb_alloc = &pmalloc;
739 void startGpuProfiler(void)
741 /* The NVPROF_ID environment variable is set by nvprof and indicates that
742 mdrun is executed in the CUDA profiler.
743 If nvprof was run with "--profile-from-start off", the profiler will
744 be started here. This way we can avoid tracing the CUDA events from the
745 first part of the run. Starting the profiler again does nothing.
/* NOTE(review): the stat declaration, braces and the cudaProfilerRun guard
 * are elided in this excerpt. */
750 stat = cudaProfilerStart();
751 CU_RET_ERR(stat, "cudaProfilerStart failed");
765 void stopGpuProfiler(void)
757 /* Stopping the nvidia profiler here allows us to eliminate the subsequent
758 API calls from the trace, e.g. uninitialization and cleanup. */
/* NOTE(review): the stat declaration, braces and the cudaProfilerRun guard
 * are elided in this excerpt. */
762 stat = cudaProfilerStop();
763 CU_RET_ERR(stat, "cudaProfilerStop failed");
767 void resetGpuProfiler(void)
769 /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
770 * the profiling here (can't stop it) which will achieve the desired effect if
771 * the run was started with the profiling disabled.
773 * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.