src/gromacs/gpu_utils/gpu_utils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \file
  36  *  \brief Define functions for detection and initialization for CUDA devices.
  37  *
  38  *  \author Szilard Pall <pall.szilard@gmail.com>
  39  */
  40
  41 #include "gmxpre.h"
  42
  43 #include "gpu_utils.h"
  44
  45 #include "config.h"
  46
  47 #include <assert.h>
  48 #include <stdio.h>
  49 #include <stdlib.h>
  50
  51 #include <cuda_profiler_api.h>
  52
  53 #include "gromacs/gpu_utils/cudautils.cuh"
  54 #include "gromacs/gpu_utils/pmalloc_cuda.h"
  55 #include "gromacs/hardware/gpu_hw_info.h"
  56 #include "gromacs/utility/basedefinitions.h"
  57 #include "gromacs/utility/cstringutil.h"
  58 #include "gromacs/utility/fatalerror.h"
  59 #include "gromacs/utility/gmxassert.h"
  60 #include "gromacs/utility/logger.h"
  61 #include "gromacs/utility/programcontext.h"
  62 #include "gromacs/utility/smalloc.h"
  63 #include "gromacs/utility/snprintf.h"
  64 #include "gromacs/utility/stringutil.h"
  65
  66 #if HAVE_NVML
  67 #include <nvml.h>
  68 #define HAVE_NVML_APPLICATION_CLOCKS (NVML_API_VERSION >= 6)
  69 #else  /* HAVE_NVML */
  70 #define HAVE_NVML_APPLICATION_CLOCKS 0
  71 #endif /* HAVE_NVML */
  72
  73 #if defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS
  74 /*! Check for NVML error on the return status of a NVML API call. */
  75 #  define HANDLE_NVML_RET_ERR(status, msg) \
  76     do { \
  77         if (status != NVML_SUCCESS) \
  78         { \
  79             gmx_warning("%s: %s\n", msg, nvmlErrorString(status)); \
  80         } \
  81     } while (0)
  82 #else  /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
  83 #  define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
  84 #endif /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
  85
  86 #if HAVE_NVML_APPLICATION_CLOCKS
  87 static const gmx_bool            bCompiledWithApplicationClockSupport = true;
  88 #else
  89 static const gmx_bool gmx_unused bCompiledWithApplicationClockSupport = false;
  90 #endif
  91
  92 /*! \internal \brief
  93  * Max number of devices supported by CUDA (for consistency checking).
  94  *
  95  * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
  96  */
  97 static int  cuda_max_device_count = 32;
  98
  99 static bool cudaProfilerRun      = ((getenv("NVPROF_ID") != NULL));
 100
 101 /** Dummy kernel used for sanity checking. */
 102 static __global__ void k_dummy_test(void)
 103 {
 104 }
 105
 106 static void checkCompiledTargetCompatibility(const gmx_device_info_t *devInfo)
 107 {
 108     assert(devInfo);
 109
 110     cudaFuncAttributes attributes;
 111     cudaError_t        stat = cudaFuncGetAttributes(&attributes, k_dummy_test);
 112
 113     if (cudaErrorInvalidDeviceFunction == stat)
 114     {
 115         gmx_fatal(FARGS,
 116                   "The %s binary was not compiled for the selected GPU "
 117                   "(device ID #%d, compute capability %d.%d).\n"
 118                   "When selecting target GPU architectures with GMX_CUDA_TARGET_SM, "
 119                   "make sure to pass the appropriate architecture(s) corresponding to the "
 120                   "device(s) intended to be used (see in the GPU info listing) or alternatively "
 121                   "pass in GMX_CUDA_TARGET_COMPUTE an appropriate virtual architecture. ",
 122                   gmx::getProgramContext().displayName(), devInfo->id,
 123                   devInfo->prop.major, devInfo->prop.minor);
 124     }
 125
 126     CU_RET_ERR(stat, "cudaFuncGetAttributes failed");
 127
 128     if (devInfo->prop.major >= 3 && attributes.ptxVersion < 30)
 129     {
 130         gmx_fatal(FARGS,
 131                   "The GPU device code was compiled at runtime from 2.0 source which is "
 132                   "not compatible with the selected GPU (device ID #%d, compute capability %d.%d). "
 133                   "Pass the appropriate target in GMX_CUDA_TARGET_SM or a >=30 value to GMX_CUDA_TARGET_COMPUTE.",
 134                   devInfo->id,
 135                   devInfo->prop.major, devInfo->prop.minor);
 136     }
 137 }
 138
 139 bool isHostMemoryPinned(void *h_ptr)
 140 {
 141     cudaPointerAttributes memoryAttributes;
 142     cudaError_t           stat = cudaPointerGetAttributes(&memoryAttributes, h_ptr);
 143
 144     bool                  result = false;
 145     switch (stat)
 146     {
 147         case cudaSuccess:
 148             result = true;
 149             break;
 150
 151         case cudaErrorInvalidValue:
 152             // If the buffer was not pinned, then it will not be recognized by CUDA at all
 153             result = false;
 154             // Reset the last error status
 155             cudaGetLastError();
 156             break;
 157
 158         default:
 159             CU_RET_ERR(stat, "Unexpected CUDA error");
 160     }
 161     return result;
 162 }
 163
 164 /*!
 165  * \brief Runs GPU sanity checks.
 166  *
 167  * Runs a series of checks to determine that the given GPU and underlying CUDA
 168  * driver/runtime functions properly.
 169  * Returns properties of a device with given ID or the one that has
 170  * already been initialized earlier in the case if of \dev_id == -1.
 171  *
 172  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
 173  * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
 174  * \returns                0 if the device looks OK
 175  *
 176  * TODO: introduce errors codes and handle errors more smoothly.
 177  */
 178 static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
 179 {
 180     cudaError_t cu_err;
 181     int         dev_count, id;
 182
 183     cu_err = cudaGetDeviceCount(&dev_count);
 184     if (cu_err != cudaSuccess)
 185     {
 186         fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
 187                 cudaGetErrorString(cu_err));
 188         return -1;
 189     }
 190
 191     /* no CUDA compatible device at all */
 192     if (dev_count == 0)
 193     {
 194         return -1;
 195     }
 196
 197     /* things might go horribly wrong if cudart is not compatible with the driver */
 198     if (dev_count < 0 || dev_count > cuda_max_device_count)
 199     {
 200         return -1;
 201     }
 202
 203     if (dev_id == -1) /* device already selected let's not destroy the context */
 204     {
 205         cu_err = cudaGetDevice(&id);
 206         if (cu_err != cudaSuccess)
 207         {
 208             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
 209                     cudaGetErrorString(cu_err));
 210             return -1;
 211         }
 212     }
 213     else
 214     {
 215         id = dev_id;
 216         if (id > dev_count - 1) /* pfff there's no such device */
 217         {
 218             fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
 219                     dev_id, dev_count);
 220             return -1;
 221         }
 222     }
 223
 224     memset(dev_prop, 0, sizeof(cudaDeviceProp));
 225     cu_err = cudaGetDeviceProperties(dev_prop, id);
 226     if (cu_err != cudaSuccess)
 227     {
 228         fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
 229                 cudaGetErrorString(cu_err));
 230         return -1;
 231     }
 232
 233     /* both major & minor is 9999 if no CUDA capable devices are present */
 234     if (dev_prop->major == 9999 && dev_prop->minor == 9999)
 235     {
 236         return -1;
 237     }
 238     /* we don't care about emulation mode */
 239     if (dev_prop->major == 0)
 240     {
 241         return -1;
 242     }
 243
 244     if (id != -1)
 245     {
 246         cu_err = cudaSetDevice(id);
 247         if (cu_err != cudaSuccess)
 248         {
 249             fprintf(stderr, "Error %d while switching to device #%d: %s\n",
 250                     cu_err, id, cudaGetErrorString(cu_err));
 251             return -1;
 252         }
 253     }
 254
 255     /* try to execute a dummy kernel */
 256     k_dummy_test<<< 1, 512>>> ();
 257     if (cudaThreadSynchronize() != cudaSuccess)
 258     {
 259         return -1;
 260     }
 261
 262     /* destroy context if we created one */
 263     if (id != -1)
 264     {
 265         cu_err = cudaDeviceReset();
 266         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 267     }
 268
 269     return 0;
 270 }
 271
 272 #if HAVE_NVML_APPLICATION_CLOCKS
 273 /*! \brief Determines and adds the NVML device ID to the passed \cuda_dev.
 274  *
 275  * Determines and adds the NVML device ID to the passed \cuda_dev. This is done by
 276  * matching PCI-E information from \cuda_dev with the available NVML devices.
 277  *
 278  * \param[in,out] cuda_dev  CUDA device information to enrich with NVML device info
 279  * \returns                 true if \cuda_dev could be enriched with matching NVML device information.
 280  */
 281 static bool addNVMLDeviceId(gmx_device_info_t* cuda_dev)
 282 {
 283     nvmlDevice_t nvml_device_id;
 284     unsigned int nvml_device_count  = 0;
 285     nvmlReturn_t nvml_stat          = nvmlDeviceGetCount ( &nvml_device_count );
 286     bool         nvmlWasInitialized = false;
 287     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetCount failed" );
 288     for (unsigned int nvml_device_idx = 0; nvml_stat == NVML_SUCCESS && nvml_device_idx < nvml_device_count; ++nvml_device_idx)
 289     {
 290         nvml_stat = nvmlDeviceGetHandleByIndex ( nvml_device_idx, &nvml_device_id );
 291         HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetHandleByIndex failed" );
 292         if (nvml_stat != NVML_SUCCESS)
 293         {
 294             break;
 295         }
 296
 297         nvmlPciInfo_t nvml_pci_info;
 298         nvml_stat = nvmlDeviceGetPciInfo ( nvml_device_id, &nvml_pci_info );
 299         HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetPciInfo failed" );
 300         if (nvml_stat != NVML_SUCCESS)
 301         {
 302             break;
 303         }
 304         if (static_cast<unsigned int>(cuda_dev->prop.pciBusID) == nvml_pci_info.bus &&
 305             static_cast<unsigned int>(cuda_dev->prop.pciDeviceID) == nvml_pci_info.device &&
 306             static_cast<unsigned int>(cuda_dev->prop.pciDomainID) == nvml_pci_info.domain)
 307         {
 308             nvmlWasInitialized         = true;
 309             cuda_dev->nvml_device_id   = nvml_device_id;
 310             break;
 311         }
 312     }
 313     return nvmlWasInitialized;
 314 }
 315
 316 /*! \brief Reads and returns the application clocks for device.
 317  *
 318  * \param[in]  device        The GPU device
 319  * \param[out] app_sm_clock  The current application SM clock
 320  * \param[out] app_mem_clock The current application memory clock
 321  * \returns if applacation clocks are supported
 322  */
 323 static bool getApplicationClocks(const gmx_device_info_t *cuda_dev,
 324                                  unsigned int            *app_sm_clock,
 325                                  unsigned int            *app_mem_clock)
 326 {
 327     nvmlReturn_t nvml_stat;
 328
 329     nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_SM, app_sm_clock);
 330     if (NVML_ERROR_NOT_SUPPORTED == nvml_stat)
 331     {
 332         return false;
 333     }
 334     HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed for NVIDIA_CLOCK_SM");
 335     nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, app_mem_clock);
 336     HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed for NVIDIA_CLOCK_MEM");
 337
 338     return true;
 339 }
 340 #endif /* HAVE_NVML_APPLICATION_CLOCKS */
 341
 342 /*! \brief Tries to set application clocks for the GPU with the given index.
 343  *
 344  * Application clocks are set to the max supported value to increase
 345  * performance if application clock permissions allow this. For future
 346  * GPU architectures a more sophisticated scheme might be required.
 347  *
 348  * \todo Refactor this into a detection phase and a work phase. Also
 349  * refactor to remove compile-time dependence on logging header.
 350  *
 351  * \param     mdlog         log file to write to
 352  * \param[in] cuda_dev      GPU device info for the GPU in use
 353  * \returns                 true if no error occurs during application clocks handling.
 354  */
 355 static gmx_bool init_gpu_application_clocks(
 356         const gmx::MDLogger &mdlog,
 357         gmx_device_info_t   *cuda_dev)
 358 {
 359     const cudaDeviceProp *prop                        = &cuda_dev->prop;
 360     int                   cuda_compute_capability     = prop->major * 10 + prop->minor;
 361     gmx_bool              bGpuCanUseApplicationClocks =
 362         ((0 == gmx_wcmatch("*Tesla*", prop->name) && cuda_compute_capability >= 35 ) ||
 363          (0 == gmx_wcmatch("*Quadro*", prop->name) && cuda_compute_capability >= 52 ));
 364     if (!bGpuCanUseApplicationClocks)
 365     {
 366         return true;
 367     }
 368 #if !HAVE_NVML
 369     GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 370             "NOTE: GROMACS was configured without NVML support hence it can not exploit\n"
 371             "      application clocks of the detected %s GPU to improve performance.\n"
 372             "      Recompile with the NVML library (compatible with the driver used) or set application clocks manually.",
 373             prop->name);
 374     return true;
 375 #else
 376     if (!bCompiledWithApplicationClockSupport)
 377     {
 378         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 379                 "NOTE: GROMACS was compiled with an old NVML library which does not support\n"
 380                 "      managing application clocks of the detected %s GPU to improve performance.\n"
 381                 "      If your GPU supports application clocks, upgrade NVML (and driver) and recompile or set the clocks manually.",
 382                 prop->name );
 383         return true;
 384     }
 385
 386     /* We've compiled with NVML application clocks support, and have a GPU that can use it */
 387     nvmlReturn_t nvml_stat = NVML_SUCCESS;
 388     char        *env;
 389     //TODO: GMX_GPU_APPLICATION_CLOCKS is currently only used to enable/disable setting of application clocks
 390     //      this variable can be later used to give a user more fine grained control.
 391     env = getenv("GMX_GPU_APPLICATION_CLOCKS");
 392     if (env != NULL && ( strcmp( env, "0") == 0 ||
 393                          gmx_strcasecmp( env, "OFF") == 0 ||
 394                          gmx_strcasecmp( env, "DISABLE") == 0 ))
 395     {
 396         return true;
 397     }
 398     nvml_stat = nvmlInit();
 399     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlInit failed." );
 400     if (nvml_stat != NVML_SUCCESS)
 401     {
 402         return false;
 403     }
 404
 405     if (!addNVMLDeviceId(cuda_dev))
 406     {
 407         return false;
 408     }
 409     //get current application clocks setting
 410     if (!getApplicationClocks(cuda_dev,
 411                               &cuda_dev->nvml_orig_app_sm_clock,
 412                               &cuda_dev->nvml_orig_app_mem_clock))
 413     {
 414         return false;
 415     }
 416     //get max application clocks
 417     unsigned int max_sm_clock  = 0;
 418     unsigned int max_mem_clock = 0;
 419     nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_SM, &max_sm_clock);
 420     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
 421     nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock);
 422     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
 423
 424     cuda_dev->nvml_is_restricted      = NVML_FEATURE_ENABLED;
 425     cuda_dev->nvml_app_clocks_changed = false;
 426
 427     if (cuda_dev->nvml_orig_app_sm_clock >= max_sm_clock)
 428     {
 429         //TODO: This should probably be integrated into the GPU Properties table.
 430         GMX_LOG(mdlog.info).appendTextFormatted(
 431                 "Application clocks (GPU clocks) for %s are (%d,%d)",
 432                 cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
 433         return true;
 434     }
 435
 436     if (cuda_compute_capability >= 60)
 437     {
 438         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 439                 "Cannot change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nPlease contact your admin to change application clocks.\n",
 440                 cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);
 441         return true;
 442     }
 443
 444     nvml_stat = nvmlDeviceGetAPIRestriction(cuda_dev->nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(cuda_dev->nvml_is_restricted));
 445     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetAPIRestriction failed" );
 446
 447     if (nvml_stat != NVML_SUCCESS)
 448     {
 449         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 450                 "Cannot change GPU application clocks to optimal values due to NVML error (%d): %s.",
 451                 nvml_stat, nvmlErrorString(nvml_stat));
 452         return false;
 453     }
 454
 455     if (cuda_dev->nvml_is_restricted != NVML_FEATURE_DISABLED)
 456     {
 457         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 458                 "Cannot change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clocks.",
 459                 cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);
 460         return true;
 461     }
 462
 463     /* Note: Distinguishing between different types of GPUs here might be necessary in the future,
 464        e.g. if max application clocks should not be used for certain GPUs. */
 465     GMX_LOG(mdlog.warning).appendTextFormatted(
 466             "Changing GPU application clocks for %s to (%d,%d)",
 467             cuda_dev->prop.name, max_mem_clock, max_sm_clock);
 468     nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, max_mem_clock, max_sm_clock);
 469     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
 470     cuda_dev->nvml_app_clocks_changed = true;
 471     cuda_dev->nvml_set_app_sm_clock   = max_sm_clock;
 472     cuda_dev->nvml_set_app_mem_clock  = max_mem_clock;
 473
 474     return true;
 475 #endif /* HAVE_NVML */
 476 }
 477
 478 /*! \brief Resets application clocks if changed and cleans up NVML for the passed \gpu_dev.
 479  *
 480  * \param[in] gpu_dev  CUDA device information
 481  */
 482 static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused * cuda_dev)
 483 {
 484 #if !HAVE_NVML_APPLICATION_CLOCKS
 485     GMX_UNUSED_VALUE(cuda_dev);
 486     return true;
 487 #else /* HAVE_NVML_APPLICATION_CLOCKS */
 488     nvmlReturn_t nvml_stat = NVML_SUCCESS;
 489     if (cuda_dev &&
 490         cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED &&
 491         cuda_dev->nvml_app_clocks_changed)
 492     {
 493         /* Check if the clocks are still what we set them to.
 494          * If so, set them back to the state we originally found them in.
 495          * If not, don't touch them, because something else set them later.
 496          */
 497         unsigned int app_sm_clock, app_mem_clock;
 498         getApplicationClocks(cuda_dev, &app_sm_clock, &app_mem_clock);
 499         if (app_sm_clock  == cuda_dev->nvml_set_app_sm_clock &&
 500             app_mem_clock == cuda_dev->nvml_set_app_mem_clock)
 501         {
 502             nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
 503             HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceSetApplicationsClock failed" );
 504         }
 505     }
 506     nvml_stat = nvmlShutdown();
 507     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlShutdown failed" );
 508     return (nvml_stat == NVML_SUCCESS);
 509 #endif /* HAVE_NVML_APPLICATION_CLOCKS */
 510 }
 511
 512 void init_gpu(const gmx::MDLogger &mdlog,
 513               gmx_device_info_t   *deviceInfo)
 514 {
 515     cudaError_t stat;
 516
 517     assert(deviceInfo);
 518
 519     stat = cudaSetDevice(deviceInfo->id);
 520     if (stat != cudaSuccess)
 521     {
 522         auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id);
 523         CU_RET_ERR(stat, message.c_str());
 524     }
 525
 526     if (debug)
 527     {
 528         fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceInfo->id, deviceInfo->prop.name);
 529     }
 530
 531     checkCompiledTargetCompatibility(deviceInfo);
 532
 533     //Ignoring return value as NVML errors should be treated not critical.
 534     init_gpu_application_clocks(mdlog, deviceInfo);
 535 }
 536
 537 void free_gpu(const gmx_device_info_t *deviceInfo)
 538 {
 539     // One should only attempt to clear the device context when
 540     // it has been used, but currently the only way to know that a GPU
 541     // device was used is that deviceInfo will be non-null.
 542     if (deviceInfo == nullptr)
 543     {
 544         return;
 545     }
 546
 547     cudaError_t  stat;
 548
 549     if (debug)
 550     {
 551         int gpuid;
 552         stat = cudaGetDevice(&gpuid);
 553         CU_RET_ERR(stat, "cudaGetDevice failed");
 554         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 555     }
 556
 557     if (!reset_gpu_application_clocks(deviceInfo))
 558     {
 559         gmx_warning("Failed to reset GPU application clocks on GPU #%d", deviceInfo->id);
 560     }
 561
 562     stat = cudaDeviceReset();
 563     if (stat != cudaSuccess)
 564     {
 565         gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat));
 566     }
 567 }
 568
 569 gmx_device_info_t *getDeviceInfo(const gmx_gpu_info_t &gpu_info,
 570                                  int                   deviceId)
 571 {
 572     if (deviceId < 0 || deviceId >= gpu_info.n_dev)
 573     {
 574         gmx_incons("Invalid GPU deviceId requested");
 575     }
 576     return &gpu_info.gpu_dev[deviceId];
 577 }
 578
 579 /*! \brief Returns true if the gpu characterized by the device properties is
 580  *  supported by the native gpu acceleration.
 581  *
 582  * \param[in] dev_prop  the CUDA device properties of the gpus to test.
 583  * \returns             true if the GPU properties passed indicate a compatible
 584  *                      GPU, otherwise false.
 585  */
 586 static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
 587 {
 588     return (dev_prop->major >= 2);
 589 }
 590
 591 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 592  *
 593  *  Returns a status value which indicates compatibility or one of the following
 594  *  errors: incompatibility, insistence, or insanity (=unexpected behavior).
 595  *  It also returns the respective device's properties in \dev_prop (if applicable).
 596  *
 597  *  \param[in]  dev_id   the ID of the GPU to check.
 598  *  \param[out] dev_prop the CUDA device properties of the device checked.
 599  *  \returns             the status of the requested device
 600  */
 601 static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
 602 {
 603     cudaError_t stat;
 604     int         ndev;
 605
 606     stat = cudaGetDeviceCount(&ndev);
 607     if (stat != cudaSuccess)
 608     {
 609         return egpuInsane;
 610     }
 611
 612     if (dev_id > ndev - 1)
 613     {
 614         return egpuNonexistent;
 615     }
 616
 617     /* TODO: currently we do not make a distinction between the type of errors
 618      * that can appear during sanity checks. This needs to be improved, e.g if
 619      * the dummy test kernel fails to execute with a "device busy message" we
 620      * should appropriately report that the device is busy instead of insane.
 621      */
 622     if (do_sanity_checks(dev_id, dev_prop) == 0)
 623     {
 624         if (is_gmx_supported_gpu(dev_prop))
 625         {
 626             return egpuCompatible;
 627         }
 628         else
 629         {
 630             return egpuIncompatible;
 631         }
 632     }
 633     else
 634     {
 635         return egpuInsane;
 636     }
 637 }
 638
 639
 640 int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
 641 {
 642     int                i, ndev, checkres, retval;
 643     cudaError_t        stat;
 644     cudaDeviceProp     prop;
 645     gmx_device_info_t *devs;
 646
 647     assert(gpu_info);
 648     assert(err_str);
 649
 650     gpu_info->n_dev_compatible = 0;
 651
 652     ndev    = 0;
 653     devs    = NULL;
 654
 655     stat = cudaGetDeviceCount(&ndev);
 656     if (stat != cudaSuccess)
 657     {
 658         const char *s;
 659
 660         /* cudaGetDeviceCount failed which means that there is something
 661          * wrong with the machine: driver-runtime mismatch, all GPUs being
 662          * busy in exclusive mode, or some other condition which should
 663          * result in us issuing a warning a falling back to CPUs. */
 664         retval = -1;
 665         s      = cudaGetErrorString(stat);
 666         strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
 667     }
 668     else
 669     {
 670         snew(devs, ndev);
 671         for (i = 0; i < ndev; i++)
 672         {
 673             checkres = is_gmx_supported_gpu_id(i, &prop);
 674
 675             devs[i].id   = i;
 676             devs[i].prop = prop;
 677             devs[i].stat = checkres;
 678
 679             if (checkres == egpuCompatible)
 680             {
 681                 gpu_info->n_dev_compatible++;
 682             }
 683         }
 684         retval = 0;
 685     }
 686
 687     gpu_info->n_dev   = ndev;
 688     gpu_info->gpu_dev = devs;
 689
 690     return retval;
 691 }
 692
 693 std::vector<int> getCompatibleGpus(const gmx_gpu_info_t &gpu_info)
 694 {
 695     // Possible minor over-allocation here, but not important for anything
 696     std::vector<int> compatibleGpus;
 697     compatibleGpus.reserve(gpu_info.n_dev);
 698     for (int i = 0; i < gpu_info.n_dev; i++)
 699     {
 700         assert(gpu_info.gpu_dev);
 701         if (gpu_info.gpu_dev[i].stat == egpuCompatible)
 702         {
 703             compatibleGpus.push_back(i);
 704         }
 705     }
 706     return compatibleGpus;
 707 }
 708
 709 const char *getGpuCompatibilityDescription(const gmx_gpu_info_t &gpu_info,
 710                                            int                   index)
 711 {
 712     return (index >= gpu_info.n_dev ?
 713             gpu_detect_res_str[egpuNonexistent] :
 714             gpu_detect_res_str[gpu_info.gpu_dev[index].stat]);
 715 }
 716
 717 void free_gpu_info(const gmx_gpu_info_t *gpu_info)
 718 {
 719     if (gpu_info == NULL)
 720     {
 721         return;
 722     }
 723
 724     sfree(gpu_info->gpu_dev);
 725 }
 726
 727 void get_gpu_device_info_string(char *s, const gmx_gpu_info_t &gpu_info, int index)
 728 {
 729     assert(s);
 730
 731     if (index < 0 && index >= gpu_info.n_dev)
 732     {
 733         return;
 734     }
 735
 736     gmx_device_info_t *dinfo = &gpu_info.gpu_dev[index];
 737
 738     bool               bGpuExists =
 739         dinfo->stat == egpuCompatible ||
 740         dinfo->stat == egpuIncompatible;
 741
 742     if (!bGpuExists)
 743     {
 744         sprintf(s, "#%d: %s, stat: %s",
 745                 dinfo->id, "N/A",
 746                 gpu_detect_res_str[dinfo->stat]);
 747     }
 748     else
 749     {
 750         sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
 751                 dinfo->id, dinfo->prop.name,
 752                 dinfo->prop.major, dinfo->prop.minor,
 753                 dinfo->prop.ECCEnabled ? "yes" : " no",
 754                 gpu_detect_res_str[dinfo->stat]);
 755     }
 756 }
 757
 758 int get_current_cuda_gpu_device_id(void)
 759 {
 760     int gpuid;
 761     CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 762
 763     return gpuid;
 764 }
 765
 766 size_t sizeof_gpu_dev_info(void)
 767 {
 768     return sizeof(gmx_device_info_t);
 769 }
 770
 771 void gpu_set_host_malloc_and_free(bool               bUseGpuKernels,
 772                                   gmx_host_alloc_t **nb_alloc,
 773                                   gmx_host_free_t  **nb_free)
 774 {
 775     if (bUseGpuKernels)
 776     {
 777         *nb_alloc = &pmalloc;
 778         *nb_free  = &pfree;
 779     }
 780     else
 781     {
 782         *nb_alloc = NULL;
 783         *nb_free  = NULL;
 784     }
 785 }
 786
 787 void startGpuProfiler(void)
 788 {
 789     /* The NVPROF_ID environment variable is set by nvprof and indicates that
 790        mdrun is executed in the CUDA profiler.
 791        If nvprof was run is with "--profile-from-start off", the profiler will
 792        be started here. This way we can avoid tracing the CUDA events from the
 793        first part of the run. Starting the profiler again does nothing.
 794      */
 795     if (cudaProfilerRun)
 796     {
 797         cudaError_t stat;
 798         stat = cudaProfilerStart();
 799         CU_RET_ERR(stat, "cudaProfilerStart failed");
 800     }
 801 }
 802
 803 void stopGpuProfiler(void)
 804 {
 805     /* Stopping the nvidia here allows us to eliminate the subsequent
 806        API calls from the trace, e.g. uninitialization and cleanup. */
 807     if (cudaProfilerRun)
 808     {
 809         cudaError_t stat;
 810         stat = cudaProfilerStop();
 811         CU_RET_ERR(stat, "cudaProfilerStop failed");
 812     }
 813 }
 814
 815 void resetGpuProfiler(void)
 816 {
 817     /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
 818      *  the profiling here (can't stop it) which will achieve the desired effect if
 819      *  the run was started with the profiling disabled.
 820      *
 821      * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.
 822      * stopGpuProfiler();
 823      */
 824     if (cudaProfilerRun)
 825     {
 826         startGpuProfiler();
 827     }
 828 }