src/gromacs/gpu_utils/gpu_utils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \file
  36  *  \brief Define functions for detection and initialization for CUDA devices.
  37  *
  38  *  \author Szilard Pall <pall.szilard@gmail.com>
  39  */
  40
  41 #include "gmxpre.h"
  42
  43 #include "gpu_utils.h"
  44
  45 #include "config.h"
  46
  47 #include <assert.h>
  48 #include <stdio.h>
  49 #include <stdlib.h>
  50
  51 #include <cuda_profiler_api.h>
  52
  53 #include "gromacs/gpu_utils/cudautils.cuh"
  54 #include "gromacs/gpu_utils/pmalloc_cuda.h"
  55 #include "gromacs/hardware/gpu_hw_info.h"
  56 #include "gromacs/utility/basedefinitions.h"
  57 #include "gromacs/utility/cstringutil.h"
  58 #include "gromacs/utility/logger.h"
  59 #include "gromacs/utility/smalloc.h"
  60
  61 #if HAVE_NVML
  62 #include <nvml.h>
  63 #define HAVE_NVML_APPLICATION_CLOCKS (NVML_API_VERSION >= 6)
  64 #else  /* HAVE_NVML */
  65 #define HAVE_NVML_APPLICATION_CLOCKS 0
  66 #endif /* HAVE_NVML */
  67
  68 #if defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS
  69 /*! Check for NVML error on the return status of a NVML API call. */
  70 #  define HANDLE_NVML_RET_ERR(status, msg) \
  71     do { \
  72         if (status != NVML_SUCCESS) \
  73         { \
  74             gmx_warning("%s: %s\n", msg, nvmlErrorString(status)); \
  75         } \
  76     } while (0)
  77 #else  /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
  78 #  define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
  79 #endif /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
  80
  81 #if HAVE_NVML_APPLICATION_CLOCKS
  82 static const gmx_bool            bCompiledWithApplicationClockSupport = true;
  83 #else
  84 static const gmx_bool gmx_unused bCompiledWithApplicationClockSupport = false;
  85 #endif
  86
  87 /*! \internal \brief
  88  * Max number of devices supported by CUDA (for consistency checking).
  89  *
  90  * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
  91  */
  92 static int  cuda_max_device_count = 32;
  93
  94 static bool cudaProfilerRun      = ((getenv("NVPROF_ID") != NULL));
  95
  96 /** Dummy kernel used for sanity checking. */
  97 __global__ void k_dummy_test()
  98 {
  99 }
 100
 101
 102 /*!
 103  * \brief Runs GPU sanity checks.
 104  *
 105  * Runs a series of checks to determine that the given GPU and underlying CUDA
 106  * driver/runtime functions properly.
 107  * Returns properties of a device with given ID or the one that has
 108  * already been initialized earlier in the case if of \dev_id == -1.
 109  *
 110  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
 111  * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
 112  * \returns                0 if the device looks OK
 113  *
 114  * TODO: introduce errors codes and handle errors more smoothly.
 115  */
 116 static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
 117 {
 118     cudaError_t cu_err;
 119     int         dev_count, id;
 120
 121     cu_err = cudaGetDeviceCount(&dev_count);
 122     if (cu_err != cudaSuccess)
 123     {
 124         fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
 125                 cudaGetErrorString(cu_err));
 126         return -1;
 127     }
 128
 129     /* no CUDA compatible device at all */
 130     if (dev_count == 0)
 131     {
 132         return -1;
 133     }
 134
 135     /* things might go horribly wrong if cudart is not compatible with the driver */
 136     if (dev_count < 0 || dev_count > cuda_max_device_count)
 137     {
 138         return -1;
 139     }
 140
 141     if (dev_id == -1) /* device already selected let's not destroy the context */
 142     {
 143         cu_err = cudaGetDevice(&id);
 144         if (cu_err != cudaSuccess)
 145         {
 146             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
 147                     cudaGetErrorString(cu_err));
 148             return -1;
 149         }
 150     }
 151     else
 152     {
 153         id = dev_id;
 154         if (id > dev_count - 1) /* pfff there's no such device */
 155         {
 156             fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
 157                     dev_id, dev_count);
 158             return -1;
 159         }
 160     }
 161
 162     memset(dev_prop, 0, sizeof(cudaDeviceProp));
 163     cu_err = cudaGetDeviceProperties(dev_prop, id);
 164     if (cu_err != cudaSuccess)
 165     {
 166         fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
 167                 cudaGetErrorString(cu_err));
 168         return -1;
 169     }
 170
 171     /* both major & minor is 9999 if no CUDA capable devices are present */
 172     if (dev_prop->major == 9999 && dev_prop->minor == 9999)
 173     {
 174         return -1;
 175     }
 176     /* we don't care about emulation mode */
 177     if (dev_prop->major == 0)
 178     {
 179         return -1;
 180     }
 181
 182     if (id != -1)
 183     {
 184         cu_err = cudaSetDevice(id);
 185         if (cu_err != cudaSuccess)
 186         {
 187             fprintf(stderr, "Error %d while switching to device #%d: %s\n",
 188                     cu_err, id, cudaGetErrorString(cu_err));
 189             return -1;
 190         }
 191     }
 192
 193     /* try to execute a dummy kernel */
 194     k_dummy_test<<< 1, 512>>> ();
 195     if (cudaThreadSynchronize() != cudaSuccess)
 196     {
 197         return -1;
 198     }
 199
 200     /* destroy context if we created one */
 201     if (id != -1)
 202     {
 203         cu_err = cudaDeviceReset();
 204         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 205     }
 206
 207     return 0;
 208 }
 209
 210 #if HAVE_NVML_APPLICATION_CLOCKS
 211 /*! \brief Determines and adds the NVML device ID to the passed \cuda_dev.
 212  *
 213  * Determines and adds the NVML device ID to the passed \cuda_dev. This is done by
 214  * matching PCI-E information from \cuda_dev with the available NVML devices.
 215  *
 216  * \param[in,out] cuda_dev  CUDA device information to enrich with NVML device info
 217  * \returns                 true if \cuda_dev could be enriched with matching NVML device information.
 218  */
 219 static bool addNVMLDeviceId(gmx_device_info_t* cuda_dev)
 220 {
 221     nvmlDevice_t nvml_device_id;
 222     unsigned int nvml_device_count  = 0;
 223     nvmlReturn_t nvml_stat          = nvmlDeviceGetCount ( &nvml_device_count );
 224     bool         nvmlWasInitialized = false;
 225     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetCount failed" );
 226     for (unsigned int nvml_device_idx = 0; nvml_stat == NVML_SUCCESS && nvml_device_idx < nvml_device_count; ++nvml_device_idx)
 227     {
 228         nvml_stat = nvmlDeviceGetHandleByIndex ( nvml_device_idx, &nvml_device_id );
 229         HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetHandleByIndex failed" );
 230         if (nvml_stat != NVML_SUCCESS)
 231         {
 232             break;
 233         }
 234
 235         nvmlPciInfo_t nvml_pci_info;
 236         nvml_stat = nvmlDeviceGetPciInfo ( nvml_device_id, &nvml_pci_info );
 237         HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetPciInfo failed" );
 238         if (nvml_stat != NVML_SUCCESS)
 239         {
 240             break;
 241         }
 242         if (static_cast<unsigned int>(cuda_dev->prop.pciBusID) == nvml_pci_info.bus &&
 243             static_cast<unsigned int>(cuda_dev->prop.pciDeviceID) == nvml_pci_info.device &&
 244             static_cast<unsigned int>(cuda_dev->prop.pciDomainID) == nvml_pci_info.domain)
 245         {
 246             nvmlWasInitialized         = true;
 247             cuda_dev->nvml_device_id   = nvml_device_id;
 248             break;
 249         }
 250     }
 251     return nvmlWasInitialized;
 252 }
 253
 254 /*! \brief Reads and returns the application clocks for device.
 255  *
 256  * \param[in]  device        The GPU device
 257  * \param[out] app_sm_clock  The current application SM clock
 258  * \param[out] app_mem_clock The current application memory clock
 259  * \returns if applacation clocks are supported
 260  */
 261 static bool getApplicationClocks(const gmx_device_info_t *cuda_dev,
 262                                  unsigned int            *app_sm_clock,
 263                                  unsigned int            *app_mem_clock)
 264 {
 265     nvmlReturn_t nvml_stat;
 266
 267     nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_SM, app_sm_clock);
 268     if (NVML_ERROR_NOT_SUPPORTED == nvml_stat)
 269     {
 270         return false;
 271     }
 272     HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed for NVIDIA_CLOCK_SM");
 273     nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, app_mem_clock);
 274     HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed for NVIDIA_CLOCK_MEM");
 275
 276     return true;
 277 }
 278 #endif /* HAVE_NVML_APPLICATION_CLOCKS */
 279
 280 /*! \brief Tries to set application clocks for the GPU with the given index.
 281  *
 282  * The variable \gpuid is the index of the GPU in the gpu_info.cuda_dev array
 283  * to handle the application clocks for. Application clocks are set to the
 284  * max supported value to increase performance if application clock permissions
 285  * allow this. For future GPU architectures a more sophisticated scheme might be
 286  * required.
 287  *
 288  * \todo Refactor this into a detection phase and a work phase. Also
 289  * refactor to remove compile-time dependence on logging header.
 290  *
 291  * \param     mdlog         log file to write to
 292  * \param[in] gpuid         index of the GPU to set application clocks for
 293  * \param[in] gpu_info      GPU info of all detected devices in the system.
 294  * \returns                 true if no error occurs during application clocks handling.
 295  */
 296 static gmx_bool init_gpu_application_clocks(
 297         const gmx::MDLogger &mdlog, int gmx_unused gpuid,
 298         const gmx_gpu_info_t gmx_unused *gpu_info)
 299 {
 300     const cudaDeviceProp *prop                        = &gpu_info->gpu_dev[gpuid].prop;
 301     int                   cuda_compute_capability     = prop->major * 10 + prop->minor;
 302     gmx_bool              bGpuCanUseApplicationClocks =
 303         ((0 == gmx_wcmatch("*Tesla*", prop->name) && cuda_compute_capability >= 35 ) ||
 304          (0 == gmx_wcmatch("*Quadro*", prop->name) && cuda_compute_capability >= 52 ));
 305     if (!bGpuCanUseApplicationClocks)
 306     {
 307         return true;
 308     }
 309 #if !HAVE_NVML
 310     GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 311             "NOTE: GROMACS was configured without NVML support hence it can not exploit\n"
 312             "      application clocks of the detected %s GPU to improve performance.\n"
 313             "      Recompile with the NVML library (compatible with the driver used) or set application clocks manually.",
 314             prop->name);
 315     return true;
 316 #else
 317     if (!bCompiledWithApplicationClockSupport)
 318     {
 319         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 320                 "NOTE: GROMACS was compiled with an old NVML library which does not support\n"
 321                 "      managing application clocks of the detected %s GPU to improve performance.\n"
 322                 "      If your GPU supports application clocks, upgrade NVML (and driver) and recompile or set the clocks manually.",
 323                 prop->name );
 324         return true;
 325     }
 326
 327     /* We've compiled with NVML application clocks support, and have a GPU that can use it */
 328     nvmlReturn_t nvml_stat = NVML_SUCCESS;
 329     char        *env;
 330     //TODO: GMX_GPU_APPLICATION_CLOCKS is currently only used to enable/disable setting of application clocks
 331     //      this variable can be later used to give a user more fine grained control.
 332     env = getenv("GMX_GPU_APPLICATION_CLOCKS");
 333     if (env != NULL && ( strcmp( env, "0") == 0 ||
 334                          gmx_strcasecmp( env, "OFF") == 0 ||
 335                          gmx_strcasecmp( env, "DISABLE") == 0 ))
 336     {
 337         return true;
 338     }
 339     nvml_stat = nvmlInit();
 340     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlInit failed." );
 341     if (nvml_stat != NVML_SUCCESS)
 342     {
 343         return false;
 344     }
 345
 346     gmx_device_info_t *cuda_dev = &(gpu_info->gpu_dev[gpuid]);
 347
 348     if (!addNVMLDeviceId(cuda_dev))
 349     {
 350         return false;
 351     }
 352     //get current application clocks setting
 353     if (!getApplicationClocks(cuda_dev,
 354                               &cuda_dev->nvml_orig_app_sm_clock,
 355                               &cuda_dev->nvml_orig_app_mem_clock))
 356     {
 357         return false;
 358     }
 359     //get max application clocks
 360     unsigned int max_sm_clock  = 0;
 361     unsigned int max_mem_clock = 0;
 362     nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_SM, &max_sm_clock);
 363     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
 364     nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock);
 365     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
 366
 367     cuda_dev->nvml_is_restricted      = NVML_FEATURE_ENABLED;
 368     cuda_dev->nvml_app_clocks_changed = false;
 369
 370     if (cuda_dev->nvml_orig_app_sm_clock >= max_sm_clock)
 371     {
 372         //TODO: This should probably be integrated into the GPU Properties table.
 373         GMX_LOG(mdlog.info).appendTextFormatted(
 374                 "Application clocks (GPU clocks) for %s are (%d,%d)",
 375                 cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
 376         return true;
 377     }
 378
 379     if (cuda_compute_capability >= 60)
 380     {
 381         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 382                 "Cannot change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nPlease contact your admin to change application clocks.\n",
 383                 cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);
 384         return true;
 385     }
 386
 387     nvml_stat = nvmlDeviceGetAPIRestriction(cuda_dev->nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(cuda_dev->nvml_is_restricted));
 388     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetAPIRestriction failed" );
 389
 390     if (nvml_stat != NVML_SUCCESS)
 391     {
 392         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 393                 "Cannot change GPU application clocks to optimal values due to NVML error (%d): %s.",
 394                 nvml_stat, nvmlErrorString(nvml_stat));
 395         return false;
 396     }
 397
 398     if (cuda_dev->nvml_is_restricted != NVML_FEATURE_DISABLED)
 399     {
 400         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 401                 "Cannot change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clocks.",
 402                 cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);
 403         return true;
 404     }
 405
 406     /* Note: Distinguishing between different types of GPUs here might be necessary in the future,
 407        e.g. if max application clocks should not be used for certain GPUs. */
 408     GMX_LOG(mdlog.warning).appendTextFormatted(
 409             "Changing GPU application clocks for %s to (%d,%d)",
 410             cuda_dev->prop.name, max_mem_clock, max_sm_clock);
 411     nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, max_mem_clock, max_sm_clock);
 412     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
 413     cuda_dev->nvml_app_clocks_changed = true;
 414     cuda_dev->nvml_set_app_sm_clock   = max_sm_clock;
 415     cuda_dev->nvml_set_app_mem_clock  = max_mem_clock;
 416
 417     return true;
 418 #endif /* HAVE_NVML */
 419 }
 420
 421 /*! \brief Resets application clocks if changed and cleans up NVML for the passed \gpu_dev.
 422  *
 423  * \param[in] gpu_dev  CUDA device information
 424  */
 425 static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused * cuda_dev)
 426 {
 427 #if !HAVE_NVML_APPLICATION_CLOCKS
 428     GMX_UNUSED_VALUE(cuda_dev);
 429     return true;
 430 #else /* HAVE_NVML_APPLICATION_CLOCKS */
 431     nvmlReturn_t nvml_stat = NVML_SUCCESS;
 432     if (cuda_dev &&
 433         cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED &&
 434         cuda_dev->nvml_app_clocks_changed)
 435     {
 436         /* Check if the clocks are still what we set them to.
 437          * If so, set them back to the state we originally found them in.
 438          * If not, don't touch them, because something else set them later.
 439          */
 440         unsigned int app_sm_clock, app_mem_clock;
 441         getApplicationClocks(cuda_dev, &app_sm_clock, &app_mem_clock);
 442         if (app_sm_clock  == cuda_dev->nvml_set_app_sm_clock &&
 443             app_mem_clock == cuda_dev->nvml_set_app_mem_clock)
 444         {
 445             nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
 446             HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceSetApplicationsClock failed" );
 447         }
 448     }
 449     nvml_stat = nvmlShutdown();
 450     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlShutdown failed" );
 451     return (nvml_stat == NVML_SUCCESS);
 452 #endif /* HAVE_NVML_APPLICATION_CLOCKS */
 453 }
 454
 455 gmx_bool init_gpu(const gmx::MDLogger &mdlog, int mygpu, char *result_str,
 456                   const struct gmx_gpu_info_t *gpu_info,
 457                   const struct gmx_gpu_opt_t *gpu_opt)
 458 {
 459     cudaError_t stat;
 460     char        sbuf[STRLEN];
 461     int         gpuid;
 462
 463     assert(gpu_info);
 464     assert(result_str);
 465
 466     if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use)
 467     {
 468         sprintf(sbuf, "Trying to initialize an non-existent GPU: "
 469                 "there are %d selected GPU(s), but #%d was requested.",
 470                 gpu_opt->n_dev_use, mygpu);
 471         gmx_incons(sbuf);
 472     }
 473
 474     gpuid = gpu_info->gpu_dev[gpu_opt->dev_use[mygpu]].id;
 475
 476     stat = cudaSetDevice(gpuid);
 477     strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 478
 479     if (debug)
 480     {
 481         fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->gpu_dev[gpuid].prop.name);
 482     }
 483
 484     //Ignoring return value as NVML errors should be treated not critical.
 485     if (stat == cudaSuccess)
 486     {
 487         init_gpu_application_clocks(mdlog, gpuid, gpu_info);
 488     }
 489     return (stat == cudaSuccess);
 490 }
 491
 492 gmx_bool free_cuda_gpu(
 493         int gmx_unused mygpu, char *result_str,
 494         const gmx_gpu_info_t gmx_unused *gpu_info,
 495         const gmx_gpu_opt_t gmx_unused *gpu_opt
 496         )
 497 {
 498     cudaError_t  stat;
 499     gmx_bool     reset_gpu_application_clocks_status = true;
 500     int          gpuid;
 501
 502     assert(result_str);
 503
 504     if (debug)
 505     {
 506         int gpuid;
 507         stat = cudaGetDevice(&gpuid);
 508         CU_RET_ERR(stat, "cudaGetDevice failed");
 509         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 510     }
 511
 512     gpuid = gpu_opt ? gpu_opt->dev_use[mygpu] : -1;
 513     if (gpuid != -1)
 514     {
 515         reset_gpu_application_clocks_status = reset_gpu_application_clocks( &(gpu_info->gpu_dev[gpuid]) );
 516     }
 517
 518     stat = cudaDeviceReset();
 519     strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 520     return (stat == cudaSuccess) && reset_gpu_application_clocks_status;
 521 }
 522
 523 /*! \brief Returns true if the gpu characterized by the device properties is
 524  *  supported by the native gpu acceleration.
 525  *
 526  * \param[in] dev_prop  the CUDA device properties of the gpus to test.
 527  * \returns             true if the GPU properties passed indicate a compatible
 528  *                      GPU, otherwise false.
 529  */
 530 static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
 531 {
 532     return (dev_prop->major >= 2);
 533 }
 534
 535 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 536  *
 537  *  Returns a status value which indicates compatibility or one of the following
 538  *  errors: incompatibility, insistence, or insanity (=unexpected behavior).
 539  *  It also returns the respective device's properties in \dev_prop (if applicable).
 540  *
 541  *  \param[in]  dev_id   the ID of the GPU to check.
 542  *  \param[out] dev_prop the CUDA device properties of the device checked.
 543  *  \returns             the status of the requested device
 544  */
 545 static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
 546 {
 547     cudaError_t stat;
 548     int         ndev;
 549
 550     stat = cudaGetDeviceCount(&ndev);
 551     if (stat != cudaSuccess)
 552     {
 553         return egpuInsane;
 554     }
 555
 556     if (dev_id > ndev - 1)
 557     {
 558         return egpuNonexistent;
 559     }
 560
 561     /* TODO: currently we do not make a distinction between the type of errors
 562      * that can appear during sanity checks. This needs to be improved, e.g if
 563      * the dummy test kernel fails to execute with a "device busy message" we
 564      * should appropriately report that the device is busy instead of insane.
 565      */
 566     if (do_sanity_checks(dev_id, dev_prop) == 0)
 567     {
 568         if (is_gmx_supported_gpu(dev_prop))
 569         {
 570             return egpuCompatible;
 571         }
 572         else
 573         {
 574             return egpuIncompatible;
 575         }
 576     }
 577     else
 578     {
 579         return egpuInsane;
 580     }
 581 }
 582
 583
 584 int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
 585 {
 586     int                i, ndev, checkres, retval;
 587     cudaError_t        stat;
 588     cudaDeviceProp     prop;
 589     gmx_device_info_t *devs;
 590
 591     assert(gpu_info);
 592     assert(err_str);
 593
 594     gpu_info->n_dev_compatible = 0;
 595
 596     ndev    = 0;
 597     devs    = NULL;
 598
 599     stat = cudaGetDeviceCount(&ndev);
 600     if (stat != cudaSuccess)
 601     {
 602         const char *s;
 603
 604         /* cudaGetDeviceCount failed which means that there is something
 605          * wrong with the machine: driver-runtime mismatch, all GPUs being
 606          * busy in exclusive mode, or some other condition which should
 607          * result in us issuing a warning a falling back to CPUs. */
 608         retval = -1;
 609         s      = cudaGetErrorString(stat);
 610         strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
 611     }
 612     else
 613     {
 614         snew(devs, ndev);
 615         for (i = 0; i < ndev; i++)
 616         {
 617             checkres = is_gmx_supported_gpu_id(i, &prop);
 618
 619             devs[i].id   = i;
 620             devs[i].prop = prop;
 621             devs[i].stat = checkres;
 622
 623             if (checkres == egpuCompatible)
 624             {
 625                 gpu_info->n_dev_compatible++;
 626             }
 627         }
 628         retval = 0;
 629     }
 630
 631     gpu_info->n_dev   = ndev;
 632     gpu_info->gpu_dev = devs;
 633
 634     return retval;
 635 }
 636
 637 bool isGpuCompatible(const gmx_gpu_info_t *gpu_info,
 638                      int                   index)
 639 {
 640     assert(gpu_info);
 641
 642     return (index >= gpu_info->n_dev ?
 643             false :
 644             gpu_info->gpu_dev[index].stat == egpuCompatible);
 645 }
 646
 647 const char *getGpuCompatibilityDescription(const gmx_gpu_info_t *gpu_info,
 648                                            int                   index)
 649 {
 650     assert(gpu_info);
 651
 652     return (index >= gpu_info->n_dev ?
 653             gpu_detect_res_str[egpuNonexistent] :
 654             gpu_detect_res_str[gpu_info->gpu_dev[index].stat]);
 655 }
 656
 657 void free_gpu_info(const gmx_gpu_info_t *gpu_info)
 658 {
 659     if (gpu_info == NULL)
 660     {
 661         return;
 662     }
 663
 664     sfree(gpu_info->gpu_dev);
 665 }
 666
 667 void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index)
 668 {
 669     assert(s);
 670     assert(gpu_info);
 671
 672     if (index < 0 && index >= gpu_info->n_dev)
 673     {
 674         return;
 675     }
 676
 677     gmx_device_info_t *dinfo = &gpu_info->gpu_dev[index];
 678
 679     bool               bGpuExists =
 680         dinfo->stat == egpuCompatible ||
 681         dinfo->stat == egpuIncompatible;
 682
 683     if (!bGpuExists)
 684     {
 685         sprintf(s, "#%d: %s, stat: %s",
 686                 dinfo->id, "N/A",
 687                 gpu_detect_res_str[dinfo->stat]);
 688     }
 689     else
 690     {
 691         sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
 692                 dinfo->id, dinfo->prop.name,
 693                 dinfo->prop.major, dinfo->prop.minor,
 694                 dinfo->prop.ECCEnabled ? "yes" : " no",
 695                 gpu_detect_res_str[dinfo->stat]);
 696     }
 697 }
 698
 699 int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
 700                       const gmx_gpu_opt_t  *gpu_opt,
 701                       int                   idx)
 702 {
 703     assert(gpu_info);
 704     assert(gpu_opt);
 705     assert(idx >= 0 && idx < gpu_opt->n_dev_use);
 706
 707     return gpu_info->gpu_dev[gpu_opt->dev_use[idx]].id;
 708 }
 709
 710 int get_current_cuda_gpu_device_id(void)
 711 {
 712     int gpuid;
 713     CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 714
 715     return gpuid;
 716 }
 717
 718 size_t sizeof_gpu_dev_info(void)
 719 {
 720     return sizeof(gmx_device_info_t);
 721 }
 722
 723 void gpu_set_host_malloc_and_free(bool               bUseGpuKernels,
 724                                   gmx_host_alloc_t **nb_alloc,
 725                                   gmx_host_free_t  **nb_free)
 726 {
 727     if (bUseGpuKernels)
 728     {
 729         *nb_alloc = &pmalloc;
 730         *nb_free  = &pfree;
 731     }
 732     else
 733     {
 734         *nb_alloc = NULL;
 735         *nb_free  = NULL;
 736     }
 737 }
 738
 739 void startGpuProfiler(void)
 740 {
 741     /* The NVPROF_ID environment variable is set by nvprof and indicates that
 742        mdrun is executed in the CUDA profiler.
 743        If nvprof was run is with "--profile-from-start off", the profiler will
 744        be started here. This way we can avoid tracing the CUDA events from the
 745        first part of the run. Starting the profiler again does nothing.
 746      */
 747     if (cudaProfilerRun)
 748     {
 749         cudaError_t stat;
 750         stat = cudaProfilerStart();
 751         CU_RET_ERR(stat, "cudaProfilerStart failed");
 752     }
 753 }
 754
 755 void stopGpuProfiler(void)
 756 {
 757     /* Stopping the nvidia here allows us to eliminate the subsequent
 758        API calls from the trace, e.g. uninitialization and cleanup. */
 759     if (cudaProfilerRun)
 760     {
 761         cudaError_t stat;
 762         stat = cudaProfilerStop();
 763         CU_RET_ERR(stat, "cudaProfilerStop failed");
 764     }
 765 }
 766
 767 void resetGpuProfiler(void)
 768 {
 769     /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
 770      *  the profiling here (can't stop it) which will achieve the desired effect if
 771      *  the run was started with the profiling disabled.
 772      *
 773      * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.
 774      * stopGpuProfiler();
 775      */
 776     if (cudaProfilerRun)
 777     {
 778         startGpuProfiler();
 779     }
 780 }