src/gromacs/gpu_utils/gpu_utils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \file
  36  *  \brief Define functions for detection and initialization for CUDA devices.
  37  *
  38  *  \author Szilard Pall <pall.szilard@gmail.com>
  39  */
  40
  41 #include "gmxpre.h"
  42
  43 #include "gpu_utils.h"
  44
  45 #include "config.h"
  46
  47 #include <assert.h>
  48 #include <stdio.h>
  49 #include <stdlib.h>
  50
  51 #include <cuda_profiler_api.h>
  52
  53 #include "gromacs/gpu_utils/cudautils.cuh"
  54 #include "gromacs/gpu_utils/pmalloc_cuda.h"
  55 #include "gromacs/hardware/gpu_hw_info.h"
  56 #include "gromacs/utility/basedefinitions.h"
  57 #include "gromacs/utility/cstringutil.h"
  58 #include "gromacs/utility/exceptions.h"
  59 #include "gromacs/utility/fatalerror.h"
  60 #include "gromacs/utility/gmxassert.h"
  61 #include "gromacs/utility/logger.h"
  62 #include "gromacs/utility/programcontext.h"
  63 #include "gromacs/utility/smalloc.h"
  64 #include "gromacs/utility/snprintf.h"
  65 #include "gromacs/utility/stringutil.h"
  66
#if HAVE_NVML
#include <nvml.h>
/* Application clock handling is compiled in only when building against NVML API version 6 or newer. */
#define HAVE_NVML_APPLICATION_CLOCKS (NVML_API_VERSION >= 6)
#else  /* HAVE_NVML */
#define HAVE_NVML_APPLICATION_CLOCKS 0
#endif /* HAVE_NVML */

#if defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS
/*! Check for NVML error on the return status of a NVML API call.
 *
 * Issues a warning (not a fatal error) containing \c msg and the NVML error
 * string when \c status differs from NVML_SUCCESS.
 */
#  define HANDLE_NVML_RET_ERR(status, msg) \
    do { \
        if (status != NVML_SUCCESS) \
        { \
            gmx_warning("%s: %s\n", msg, nvmlErrorString(status)); \
        } \
    } while (0)
#else  /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */
/*! No-op variant used when error checking or application clock support is not compiled in. */
#  define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
#endif /* defined(CHECK_CUDA_ERRORS) && HAVE_NVML_APPLICATION_CLOCKS */

/*! Compile-time flag telling whether NVML application clock support was built in. */
#if HAVE_NVML_APPLICATION_CLOCKS
static const gmx_bool            bCompiledWithApplicationClockSupport = true;
#else
static const gmx_bool gmx_unused bCompiledWithApplicationClockSupport = false;
#endif

/*! \internal \brief
 * Max number of devices supported by CUDA (for consistency checking).
 *
 * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
 */
static int  cuda_max_device_count = 32;

/*! True when the NVPROF_ID environment variable is set, i.e. when running under the CUDA profiler. */
static bool cudaProfilerRun      = ((getenv("NVPROF_ID") != NULL));
 101
 102 /** Dummy kernel used for sanity checking. */
 103 static __global__ void k_dummy_test(void)
 104 {
 105 }
 106
 107 static void checkCompiledTargetCompatibility(const gmx_device_info_t *devInfo)
 108 {
 109     assert(devInfo);
 110
 111     cudaFuncAttributes attributes;
 112     cudaError_t        stat = cudaFuncGetAttributes(&attributes, k_dummy_test);
 113
 114     if (cudaErrorInvalidDeviceFunction == stat)
 115     {
 116         gmx_fatal(FARGS,
 117                   "The %s binary does not include support for the CUDA architecture "
 118                   "of the selected GPU (device ID #%d, compute capability %d.%d). "
 119                   "By default, GROMACS supports all common architectures, so your GPU "
 120                   "might be rare, or some architectures were disabled in the build. ",
 121                   "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and ",
 122                   "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture.",
 123                   gmx::getProgramContext().displayName(), devInfo->id,
 124                   devInfo->prop.major, devInfo->prop.minor);
 125     }
 126
 127     CU_RET_ERR(stat, "cudaFuncGetAttributes failed");
 128
 129     if (devInfo->prop.major >= 3 && attributes.ptxVersion < 30)
 130     {
 131         gmx_fatal(FARGS,
 132                   "The GPU device code was compiled at runtime from 2.0 source which is "
 133                   "not compatible with the selected GPU (device ID #%d, compute capability %d.%d). "
 134                   "Pass the appropriate target in GMX_CUDA_TARGET_SM or a >=30 value to GMX_CUDA_TARGET_COMPUTE.",
 135                   devInfo->id,
 136                   devInfo->prop.major, devInfo->prop.minor);
 137     }
 138 }
 139
 140 bool isHostMemoryPinned(void *h_ptr)
 141 {
 142     cudaPointerAttributes memoryAttributes;
 143     cudaError_t           stat = cudaPointerGetAttributes(&memoryAttributes, h_ptr);
 144
 145     bool                  result = false;
 146     switch (stat)
 147     {
 148         case cudaSuccess:
 149             result = true;
 150             break;
 151
 152         case cudaErrorInvalidValue:
 153             // If the buffer was not pinned, then it will not be recognized by CUDA at all
 154             result = false;
 155             // Reset the last error status
 156             cudaGetLastError();
 157             break;
 158
 159         default:
 160             CU_RET_ERR(stat, "Unexpected CUDA error");
 161     }
 162     return result;
 163 }
 164
 165 /*!
 166  * \brief Runs GPU sanity checks.
 167  *
 168  * Runs a series of checks to determine that the given GPU and underlying CUDA
 169  * driver/runtime functions properly.
 170  * Returns properties of a device with given ID or the one that has
 171  * already been initialized earlier in the case if of \dev_id == -1.
 172  *
 173  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
 174  * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
 175  * \returns                0 if the device looks OK
 176  *
 177  * TODO: introduce errors codes and handle errors more smoothly.
 178  */
 179 static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
 180 {
 181     cudaError_t cu_err;
 182     int         dev_count, id;
 183
 184     cu_err = cudaGetDeviceCount(&dev_count);
 185     if (cu_err != cudaSuccess)
 186     {
 187         fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
 188                 cudaGetErrorString(cu_err));
 189         return -1;
 190     }
 191
 192     /* no CUDA compatible device at all */
 193     if (dev_count == 0)
 194     {
 195         return -1;
 196     }
 197
 198     /* things might go horribly wrong if cudart is not compatible with the driver */
 199     if (dev_count < 0 || dev_count > cuda_max_device_count)
 200     {
 201         return -1;
 202     }
 203
 204     if (dev_id == -1) /* device already selected let's not destroy the context */
 205     {
 206         cu_err = cudaGetDevice(&id);
 207         if (cu_err != cudaSuccess)
 208         {
 209             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
 210                     cudaGetErrorString(cu_err));
 211             return -1;
 212         }
 213     }
 214     else
 215     {
 216         id = dev_id;
 217         if (id > dev_count - 1) /* pfff there's no such device */
 218         {
 219             fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
 220                     dev_id, dev_count);
 221             return -1;
 222         }
 223     }
 224
 225     memset(dev_prop, 0, sizeof(cudaDeviceProp));
 226     cu_err = cudaGetDeviceProperties(dev_prop, id);
 227     if (cu_err != cudaSuccess)
 228     {
 229         fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
 230                 cudaGetErrorString(cu_err));
 231         return -1;
 232     }
 233
 234     /* both major & minor is 9999 if no CUDA capable devices are present */
 235     if (dev_prop->major == 9999 && dev_prop->minor == 9999)
 236     {
 237         return -1;
 238     }
 239     /* we don't care about emulation mode */
 240     if (dev_prop->major == 0)
 241     {
 242         return -1;
 243     }
 244
 245     if (id != -1)
 246     {
 247         cu_err = cudaSetDevice(id);
 248         if (cu_err != cudaSuccess)
 249         {
 250             fprintf(stderr, "Error %d while switching to device #%d: %s\n",
 251                     cu_err, id, cudaGetErrorString(cu_err));
 252             return -1;
 253         }
 254     }
 255
 256     /* try to execute a dummy kernel */
 257     k_dummy_test<<< 1, 512>>> ();
 258     if (cudaThreadSynchronize() != cudaSuccess)
 259     {
 260         return -1;
 261     }
 262
 263     /* destroy context if we created one */
 264     if (id != -1)
 265     {
 266         cu_err = cudaDeviceReset();
 267         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 268     }
 269
 270     return 0;
 271 }
 272
 273 #if HAVE_NVML_APPLICATION_CLOCKS
 274 /*! \brief Determines and adds the NVML device ID to the passed \cuda_dev.
 275  *
 276  * Determines and adds the NVML device ID to the passed \cuda_dev. This is done by
 277  * matching PCI-E information from \cuda_dev with the available NVML devices.
 278  *
 279  * \param[in,out] cuda_dev  CUDA device information to enrich with NVML device info
 280  * \returns                 true if \cuda_dev could be enriched with matching NVML device information.
 281  */
 282 static bool addNVMLDeviceId(gmx_device_info_t* cuda_dev)
 283 {
 284     nvmlDevice_t nvml_device_id;
 285     unsigned int nvml_device_count  = 0;
 286     nvmlReturn_t nvml_stat          = nvmlDeviceGetCount ( &nvml_device_count );
 287     bool         nvmlWasInitialized = false;
 288     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetCount failed" );
 289     for (unsigned int nvml_device_idx = 0; nvml_stat == NVML_SUCCESS && nvml_device_idx < nvml_device_count; ++nvml_device_idx)
 290     {
 291         nvml_stat = nvmlDeviceGetHandleByIndex ( nvml_device_idx, &nvml_device_id );
 292         HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetHandleByIndex failed" );
 293         if (nvml_stat != NVML_SUCCESS)
 294         {
 295             break;
 296         }
 297
 298         nvmlPciInfo_t nvml_pci_info;
 299         nvml_stat = nvmlDeviceGetPciInfo ( nvml_device_id, &nvml_pci_info );
 300         HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetPciInfo failed" );
 301         if (nvml_stat != NVML_SUCCESS)
 302         {
 303             break;
 304         }
 305         if (static_cast<unsigned int>(cuda_dev->prop.pciBusID) == nvml_pci_info.bus &&
 306             static_cast<unsigned int>(cuda_dev->prop.pciDeviceID) == nvml_pci_info.device &&
 307             static_cast<unsigned int>(cuda_dev->prop.pciDomainID) == nvml_pci_info.domain)
 308         {
 309             nvmlWasInitialized         = true;
 310             cuda_dev->nvml_device_id   = nvml_device_id;
 311             break;
 312         }
 313     }
 314     return nvmlWasInitialized;
 315 }
 316
 317 /*! \brief Reads and returns the application clocks for device.
 318  *
 319  * \param[in]  device        The GPU device
 320  * \param[out] app_sm_clock  The current application SM clock
 321  * \param[out] app_mem_clock The current application memory clock
 322  * \returns if applacation clocks are supported
 323  */
 324 static bool getApplicationClocks(const gmx_device_info_t *cuda_dev,
 325                                  unsigned int            *app_sm_clock,
 326                                  unsigned int            *app_mem_clock)
 327 {
 328     nvmlReturn_t nvml_stat;
 329
 330     nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_SM, app_sm_clock);
 331     if (NVML_ERROR_NOT_SUPPORTED == nvml_stat)
 332     {
 333         return false;
 334     }
 335     HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed for NVIDIA_CLOCK_SM");
 336     nvml_stat = nvmlDeviceGetApplicationsClock(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, app_mem_clock);
 337     HANDLE_NVML_RET_ERR(nvml_stat, "nvmlDeviceGetApplicationsClock failed for NVIDIA_CLOCK_MEM");
 338
 339     return true;
 340 }
 341 #endif /* HAVE_NVML_APPLICATION_CLOCKS */
 342
 343 /*! \brief Tries to set application clocks for the GPU with the given index.
 344  *
 345  * Application clocks are set to the max supported value to increase
 346  * performance if application clock permissions allow this. For future
 347  * GPU architectures a more sophisticated scheme might be required.
 348  *
 349  * \todo Refactor this into a detection phase and a work phase. Also
 350  * refactor to remove compile-time dependence on logging header.
 351  *
 352  * \param     mdlog         log file to write to
 353  * \param[in] cuda_dev      GPU device info for the GPU in use
 354  * \returns                 true if no error occurs during application clocks handling.
 355  */
 356 static gmx_bool init_gpu_application_clocks(
 357         const gmx::MDLogger &mdlog,
 358         gmx_device_info_t   *cuda_dev)
 359 {
 360     const cudaDeviceProp *prop                        = &cuda_dev->prop;
 361     int                   cuda_compute_capability     = prop->major * 10 + prop->minor;
 362     gmx_bool              bGpuCanUseApplicationClocks =
 363         ((0 == gmx_wcmatch("*Tesla*", prop->name) && cuda_compute_capability >= 35 ) ||
 364          (0 == gmx_wcmatch("*Quadro*", prop->name) && cuda_compute_capability >= 52 ));
 365     if (!bGpuCanUseApplicationClocks)
 366     {
 367         return true;
 368     }
 369 #if !HAVE_NVML
 370     GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 371             "NOTE: GROMACS was configured without NVML support hence it can not exploit\n"
 372             "      application clocks of the detected %s GPU to improve performance.\n"
 373             "      Recompile with the NVML library (compatible with the driver used) or set application clocks manually.",
 374             prop->name);
 375     return true;
 376 #else
 377     if (!bCompiledWithApplicationClockSupport)
 378     {
 379         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 380                 "NOTE: GROMACS was compiled with an old NVML library which does not support\n"
 381                 "      managing application clocks of the detected %s GPU to improve performance.\n"
 382                 "      If your GPU supports application clocks, upgrade NVML (and driver) and recompile or set the clocks manually.",
 383                 prop->name );
 384         return true;
 385     }
 386
 387     /* We've compiled with NVML application clocks support, and have a GPU that can use it */
 388     nvmlReturn_t nvml_stat = NVML_SUCCESS;
 389     char        *env;
 390     //TODO: GMX_GPU_APPLICATION_CLOCKS is currently only used to enable/disable setting of application clocks
 391     //      this variable can be later used to give a user more fine grained control.
 392     env = getenv("GMX_GPU_APPLICATION_CLOCKS");
 393     if (env != NULL && ( strcmp( env, "0") == 0 ||
 394                          gmx_strcasecmp( env, "OFF") == 0 ||
 395                          gmx_strcasecmp( env, "DISABLE") == 0 ))
 396     {
 397         return true;
 398     }
 399     nvml_stat = nvmlInit();
 400     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlInit failed." );
 401     if (nvml_stat != NVML_SUCCESS)
 402     {
 403         return false;
 404     }
 405
 406     if (!addNVMLDeviceId(cuda_dev))
 407     {
 408         return false;
 409     }
 410     //get current application clocks setting
 411     if (!getApplicationClocks(cuda_dev,
 412                               &cuda_dev->nvml_orig_app_sm_clock,
 413                               &cuda_dev->nvml_orig_app_mem_clock))
 414     {
 415         return false;
 416     }
 417     //get max application clocks
 418     unsigned int max_sm_clock  = 0;
 419     unsigned int max_mem_clock = 0;
 420     nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_SM, &max_sm_clock);
 421     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
 422     nvml_stat = nvmlDeviceGetMaxClockInfo(cuda_dev->nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock);
 423     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
 424
 425     cuda_dev->nvml_is_restricted      = NVML_FEATURE_ENABLED;
 426     cuda_dev->nvml_app_clocks_changed = false;
 427
 428     if (cuda_dev->nvml_orig_app_sm_clock >= max_sm_clock)
 429     {
 430         //TODO: This should probably be integrated into the GPU Properties table.
 431         GMX_LOG(mdlog.info).appendTextFormatted(
 432                 "Application clocks (GPU clocks) for %s are (%d,%d)",
 433                 cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
 434         return true;
 435     }
 436
 437     if (cuda_compute_capability >= 60)
 438     {
 439         // Only warn about not being able to change clocks if they are not already at the max values
 440         if (max_mem_clock > cuda_dev->nvml_orig_app_mem_clock || max_sm_clock > cuda_dev->nvml_orig_app_sm_clock)
 441         {
 442             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 443                     "Cannot change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nPlease contact your admin to change application clocks.\n",
 444                     cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);
 445         }
 446         return true;
 447     }
 448
 449     nvml_stat = nvmlDeviceGetAPIRestriction(cuda_dev->nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(cuda_dev->nvml_is_restricted));
 450     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetAPIRestriction failed" );
 451
 452     if (nvml_stat != NVML_SUCCESS)
 453     {
 454         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 455                 "Cannot change GPU application clocks to optimal values due to NVML error (%d): %s.",
 456                 nvml_stat, nvmlErrorString(nvml_stat));
 457         return false;
 458     }
 459
 460     if (cuda_dev->nvml_is_restricted != NVML_FEATURE_DISABLED)
 461     {
 462         // Only warn about not being able to change clocks if they are not already at the max values
 463         if (max_mem_clock > cuda_dev->nvml_orig_app_mem_clock || max_sm_clock > cuda_dev->nvml_orig_app_sm_clock)
 464         {
 465             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 466                     "Cannot change application clocks for %s to optimal values due to insufficient permissions. Current values are (%d,%d), max values are (%d,%d).\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clocks.",
 467                     cuda_dev->prop.name, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock, max_mem_clock, max_sm_clock);
 468         }
 469         return true;
 470     }
 471
 472     /* Note: Distinguishing between different types of GPUs here might be necessary in the future,
 473        e.g. if max application clocks should not be used for certain GPUs. */
 474     GMX_LOG(mdlog.warning).appendTextFormatted(
 475             "Changing GPU application clocks for %s to (%d,%d)",
 476             cuda_dev->prop.name, max_mem_clock, max_sm_clock);
 477     nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, max_mem_clock, max_sm_clock);
 478     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
 479     cuda_dev->nvml_app_clocks_changed = true;
 480     cuda_dev->nvml_set_app_sm_clock   = max_sm_clock;
 481     cuda_dev->nvml_set_app_mem_clock  = max_mem_clock;
 482
 483     return true;
 484 #endif /* HAVE_NVML */
 485 }
 486
 487 /*! \brief Resets application clocks if changed and cleans up NVML for the passed \gpu_dev.
 488  *
 489  * \param[in] gpu_dev  CUDA device information
 490  */
 491 static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused * cuda_dev)
 492 {
 493 #if !HAVE_NVML_APPLICATION_CLOCKS
 494     GMX_UNUSED_VALUE(cuda_dev);
 495     return true;
 496 #else /* HAVE_NVML_APPLICATION_CLOCKS */
 497     nvmlReturn_t nvml_stat = NVML_SUCCESS;
 498     if (cuda_dev &&
 499         cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED &&
 500         cuda_dev->nvml_app_clocks_changed)
 501     {
 502         /* Check if the clocks are still what we set them to.
 503          * If so, set them back to the state we originally found them in.
 504          * If not, don't touch them, because something else set them later.
 505          */
 506         unsigned int app_sm_clock, app_mem_clock;
 507         getApplicationClocks(cuda_dev, &app_sm_clock, &app_mem_clock);
 508         if (app_sm_clock  == cuda_dev->nvml_set_app_sm_clock &&
 509             app_mem_clock == cuda_dev->nvml_set_app_mem_clock)
 510         {
 511             nvml_stat = nvmlDeviceSetApplicationsClocks(cuda_dev->nvml_device_id, cuda_dev->nvml_orig_app_mem_clock, cuda_dev->nvml_orig_app_sm_clock);
 512             HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceSetApplicationsClock failed" );
 513         }
 514     }
 515     nvml_stat = nvmlShutdown();
 516     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlShutdown failed" );
 517     return (nvml_stat == NVML_SUCCESS);
 518 #endif /* HAVE_NVML_APPLICATION_CLOCKS */
 519 }
 520
 521 void init_gpu(const gmx::MDLogger &mdlog,
 522               gmx_device_info_t   *deviceInfo)
 523 {
 524     cudaError_t stat;
 525
 526     assert(deviceInfo);
 527
 528     stat = cudaSetDevice(deviceInfo->id);
 529     if (stat != cudaSuccess)
 530     {
 531         auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id);
 532         CU_RET_ERR(stat, message.c_str());
 533     }
 534
 535     if (debug)
 536     {
 537         fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceInfo->id, deviceInfo->prop.name);
 538     }
 539
 540     checkCompiledTargetCompatibility(deviceInfo);
 541
 542     //Ignoring return value as NVML errors should be treated not critical.
 543     init_gpu_application_clocks(mdlog, deviceInfo);
 544 }
 545
 546 void free_gpu(const gmx_device_info_t *deviceInfo)
 547 {
 548     // One should only attempt to clear the device context when
 549     // it has been used, but currently the only way to know that a GPU
 550     // device was used is that deviceInfo will be non-null.
 551     if (deviceInfo == nullptr)
 552     {
 553         return;
 554     }
 555
 556     cudaError_t  stat;
 557
 558     if (debug)
 559     {
 560         int gpuid;
 561         stat = cudaGetDevice(&gpuid);
 562         CU_RET_ERR(stat, "cudaGetDevice failed");
 563         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 564     }
 565
 566     if (!reset_gpu_application_clocks(deviceInfo))
 567     {
 568         gmx_warning("Failed to reset GPU application clocks on GPU #%d", deviceInfo->id);
 569     }
 570
 571     stat = cudaDeviceReset();
 572     if (stat != cudaSuccess)
 573     {
 574         gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat));
 575     }
 576 }
 577
 578 gmx_device_info_t *getDeviceInfo(const gmx_gpu_info_t &gpu_info,
 579                                  int                   deviceId)
 580 {
 581     if (deviceId < 0 || deviceId >= gpu_info.n_dev)
 582     {
 583         gmx_incons("Invalid GPU deviceId requested");
 584     }
 585     return &gpu_info.gpu_dev[deviceId];
 586 }
 587
 588 /*! \brief Returns true if the gpu characterized by the device properties is
 589  *  supported by the native gpu acceleration.
 590  *
 591  * \param[in] dev_prop  the CUDA device properties of the gpus to test.
 592  * \returns             true if the GPU properties passed indicate a compatible
 593  *                      GPU, otherwise false.
 594  */
 595 static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
 596 {
 597     return (dev_prop->major >= 2);
 598 }
 599
 600 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 601  *
 602  *  Returns a status value which indicates compatibility or one of the following
 603  *  errors: incompatibility, insistence, or insanity (=unexpected behavior).
 604  *  It also returns the respective device's properties in \dev_prop (if applicable).
 605  *
 606  *  \param[in]  dev_id   the ID of the GPU to check.
 607  *  \param[out] dev_prop the CUDA device properties of the device checked.
 608  *  \returns             the status of the requested device
 609  */
 610 static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
 611 {
 612     cudaError_t stat;
 613     int         ndev;
 614
 615     stat = cudaGetDeviceCount(&ndev);
 616     if (stat != cudaSuccess)
 617     {
 618         return egpuInsane;
 619     }
 620
 621     if (dev_id > ndev - 1)
 622     {
 623         return egpuNonexistent;
 624     }
 625
 626     /* TODO: currently we do not make a distinction between the type of errors
 627      * that can appear during sanity checks. This needs to be improved, e.g if
 628      * the dummy test kernel fails to execute with a "device busy message" we
 629      * should appropriately report that the device is busy instead of insane.
 630      */
 631     if (do_sanity_checks(dev_id, dev_prop) == 0)
 632     {
 633         if (is_gmx_supported_gpu(dev_prop))
 634         {
 635             return egpuCompatible;
 636         }
 637         else
 638         {
 639             return egpuIncompatible;
 640         }
 641     }
 642     else
 643     {
 644         return egpuInsane;
 645     }
 646 }
 647
 648 bool canDetectGpus(std::string *errorMessage)
 649 {
 650     cudaError_t        stat;
 651     int                driverVersion = -1;
 652     stat = cudaDriverGetVersion(&driverVersion);
 653     GMX_ASSERT(stat != cudaErrorInvalidValue, "An impossible null pointer was passed to cudaDriverGetVersion");
 654     GMX_RELEASE_ASSERT(stat == cudaSuccess,
 655                        gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
 656                                          cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str());
 657     bool foundDriver = (driverVersion > 0);
 658     if (!foundDriver)
 659     {
 660         // Can't detect GPUs if there is no driver
 661         if (errorMessage != nullptr)
 662         {
 663             errorMessage->assign("No valid CUDA driver found");
 664         }
 665         return false;
 666     }
 667
 668     int numDevices;
 669     stat = cudaGetDeviceCount(&numDevices);
 670     if (stat != cudaSuccess)
 671     {
 672         if (errorMessage != nullptr)
 673         {
 674             /* cudaGetDeviceCount failed which means that there is
 675              * something wrong with the machine: driver-runtime
 676              * mismatch, all GPUs being busy in exclusive mode,
 677              * invalid CUDA_VISIBLE_DEVICES, or some other condition
 678              * which should result in GROMACS issuing a warning a
 679              * falling back to CPUs. */
 680             errorMessage->assign(cudaGetErrorString(stat));
 681         }
 682
 683         // Consume the error now that we have prepared to handle
 684         // it. This stops it reappearing next time we check for
 685         // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
 686         // valid devices, then cudaGetLastError returns the
 687         // (undocumented) cudaErrorNoDevice, but this should not be a
 688         // problem as there should be no future CUDA API calls.
 689         // NVIDIA bug report #2038718 has been filed.
 690         cudaGetLastError();
 691         // Can't detect GPUs
 692         return false;
 693     }
 694
 695     // We don't actually use numDevices here, that's not the job of
 696     // this function.
 697     return true;
 698 }
 699
 700 void findGpus(gmx_gpu_info_t *gpu_info)
 701 {
 702     int                i, ndev, checkres;
 703     cudaError_t        stat;
 704     cudaDeviceProp     prop;
 705     gmx_device_info_t *devs;
 706
 707     assert(gpu_info);
 708
 709     gpu_info->n_dev_compatible = 0;
 710
 711     ndev    = 0;
 712     devs    = NULL;
 713
 714     stat = cudaGetDeviceCount(&ndev);
 715     if (stat != cudaSuccess)
 716     {
 717         GMX_THROW(gmx::InternalError("Invalid call of findGpus() when CUDA API returned an error, perhaps "
 718                                      "canDetectGpus() was not called appropriately beforehand."));
 719     }
 720
 721     snew(devs, ndev);
 722     for (i = 0; i < ndev; i++)
 723     {
 724         checkres = is_gmx_supported_gpu_id(i, &prop);
 725
 726         devs[i].id   = i;
 727         devs[i].prop = prop;
 728         devs[i].stat = checkres;
 729
 730         if (checkres == egpuCompatible)
 731         {
 732             gpu_info->n_dev_compatible++;
 733         }
 734     }
 735     GMX_RELEASE_ASSERT(cudaSuccess == cudaPeekAtLastError(), "Should be cudaSuccess");
 736
 737     gpu_info->n_dev   = ndev;
 738     gpu_info->gpu_dev = devs;
 739 }
 740
 741 std::vector<int> getCompatibleGpus(const gmx_gpu_info_t &gpu_info)
 742 {
 743     // Possible minor over-allocation here, but not important for anything
 744     std::vector<int> compatibleGpus;
 745     compatibleGpus.reserve(gpu_info.n_dev);
 746     for (int i = 0; i < gpu_info.n_dev; i++)
 747     {
 748         assert(gpu_info.gpu_dev);
 749         if (gpu_info.gpu_dev[i].stat == egpuCompatible)
 750         {
 751             compatibleGpus.push_back(i);
 752         }
 753     }
 754     return compatibleGpus;
 755 }
 756
 757 const char *getGpuCompatibilityDescription(const gmx_gpu_info_t &gpu_info,
 758                                            int                   index)
 759 {
 760     return (index >= gpu_info.n_dev ?
 761             gpu_detect_res_str[egpuNonexistent] :
 762             gpu_detect_res_str[gpu_info.gpu_dev[index].stat]);
 763 }
 764
 765 void free_gpu_info(const gmx_gpu_info_t *gpu_info)
 766 {
 767     if (gpu_info == NULL)
 768     {
 769         return;
 770     }
 771
 772     sfree(gpu_info->gpu_dev);
 773 }
 774
 775 void get_gpu_device_info_string(char *s, const gmx_gpu_info_t &gpu_info, int index)
 776 {
 777     assert(s);
 778
 779     if (index < 0 && index >= gpu_info.n_dev)
 780     {
 781         return;
 782     }
 783
 784     gmx_device_info_t *dinfo = &gpu_info.gpu_dev[index];
 785
 786     bool               bGpuExists =
 787         dinfo->stat == egpuCompatible ||
 788         dinfo->stat == egpuIncompatible;
 789
 790     if (!bGpuExists)
 791     {
 792         sprintf(s, "#%d: %s, stat: %s",
 793                 dinfo->id, "N/A",
 794                 gpu_detect_res_str[dinfo->stat]);
 795     }
 796     else
 797     {
 798         sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
 799                 dinfo->id, dinfo->prop.name,
 800                 dinfo->prop.major, dinfo->prop.minor,
 801                 dinfo->prop.ECCEnabled ? "yes" : " no",
 802                 gpu_detect_res_str[dinfo->stat]);
 803     }
 804 }
 805
 806 int get_current_cuda_gpu_device_id(void)
 807 {
 808     int gpuid;
 809     CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 810
 811     return gpuid;
 812 }
 813
 814 size_t sizeof_gpu_dev_info(void)
 815 {
 816     return sizeof(gmx_device_info_t);
 817 }
 818
 819 void gpu_set_host_malloc_and_free(bool               bUseGpuKernels,
 820                                   gmx_host_alloc_t **nb_alloc,
 821                                   gmx_host_free_t  **nb_free)
 822 {
 823     if (bUseGpuKernels)
 824     {
 825         *nb_alloc = &pmalloc;
 826         *nb_free  = &pfree;
 827     }
 828     else
 829     {
 830         *nb_alloc = NULL;
 831         *nb_free  = NULL;
 832     }
 833 }
 834
 835 void startGpuProfiler(void)
 836 {
 837     /* The NVPROF_ID environment variable is set by nvprof and indicates that
 838        mdrun is executed in the CUDA profiler.
 839        If nvprof was run is with "--profile-from-start off", the profiler will
 840        be started here. This way we can avoid tracing the CUDA events from the
 841        first part of the run. Starting the profiler again does nothing.
 842      */
 843     if (cudaProfilerRun)
 844     {
 845         cudaError_t stat;
 846         stat = cudaProfilerStart();
 847         CU_RET_ERR(stat, "cudaProfilerStart failed");
 848     }
 849 }
 850
 851 void stopGpuProfiler(void)
 852 {
 853     /* Stopping the nvidia here allows us to eliminate the subsequent
 854        API calls from the trace, e.g. uninitialization and cleanup. */
 855     if (cudaProfilerRun)
 856     {
 857         cudaError_t stat;
 858         stat = cudaProfilerStop();
 859         CU_RET_ERR(stat, "cudaProfilerStop failed");
 860     }
 861 }
 862
 863 void resetGpuProfiler(void)
 864 {
 865     /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
 866      *  the profiling here (can't stop it) which will achieve the desired effect if
 867      *  the run was started with the profiling disabled.
 868      *
 869      * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.
 870      * stopGpuProfiler();
 871      */
 872     if (cudaProfilerRun)
 873     {
 874         startGpuProfiler();
 875     }
 876 }