src/gromacs/gmxlib/gpu_utils/gpu_utils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2010,2011,2012,2013,2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \file
  36  *  \brief Define functions for detection and initialization for CUDA devices.
  37  *
  38  *  \author Szilard Pall <pall.szilard@gmail.com>
  39  */
  40
  41 #include "gmxpre.h"
  42
  43 #include "gpu_utils.h"
  44
  45 #include "config.h"
  46
  47 #include <assert.h>
  48 #include <stdio.h>
  49 #include <stdlib.h>
  50
  51 #include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
  52 #include "gromacs/gmxlib/cuda_tools/pmalloc_cuda.h"
  53 #include "gromacs/legacyheaders/types/hw_info.h"
  54 #include "gromacs/utility/basedefinitions.h"
  55 #include "gromacs/utility/cstringutil.h"
  56 #include "gromacs/utility/smalloc.h"
  57
  58 /*! \internal \brief
  59  * Max number of devices supported by CUDA (for consistency checking).
  60  *
  61  * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
  62  */
  63 static int cuda_max_device_count = 32;
  64
  65 /** Dummy kernel used for sanity checking. */
  66 __global__ void k_dummy_test()
  67 {
  68 }
  69
  70
  71 /*!
  72  * \brief Runs GPU sanity checks.
  73  *
  74  * Runs a series of checks to determine that the given GPU and underlying CUDA
  75  * driver/runtime functions properly.
  76  * Returns properties of a device with given ID or the one that has
  77  * already been initialized earlier in the case if of \dev_id == -1.
  78  *
  79  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
  80  * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
  81  * \returns                0 if the device looks OK
  82  *
  83  * TODO: introduce errors codes and handle errors more smoothly.
  84  */
  85 static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
  86 {
  87     cudaError_t cu_err;
  88     int         dev_count, id;
  89
  90     cu_err = cudaGetDeviceCount(&dev_count);
  91     if (cu_err != cudaSuccess)
  92     {
  93         fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
  94                 cudaGetErrorString(cu_err));
  95         return -1;
  96     }
  97
  98     /* no CUDA compatible device at all */
  99     if (dev_count == 0)
 100     {
 101         return -1;
 102     }
 103
 104     /* things might go horribly wrong if cudart is not compatible with the driver */
 105     if (dev_count < 0 || dev_count > cuda_max_device_count)
 106     {
 107         return -1;
 108     }
 109
 110     if (dev_id == -1) /* device already selected let's not destroy the context */
 111     {
 112         cu_err = cudaGetDevice(&id);
 113         if (cu_err != cudaSuccess)
 114         {
 115             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
 116                     cudaGetErrorString(cu_err));
 117             return -1;
 118         }
 119     }
 120     else
 121     {
 122         id = dev_id;
 123         if (id > dev_count - 1) /* pfff there's no such device */
 124         {
 125             fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
 126                     dev_id, dev_count);
 127             return -1;
 128         }
 129     }
 130
 131     memset(dev_prop, 0, sizeof(cudaDeviceProp));
 132     cu_err = cudaGetDeviceProperties(dev_prop, id);
 133     if (cu_err != cudaSuccess)
 134     {
 135         fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
 136                 cudaGetErrorString(cu_err));
 137         return -1;
 138     }
 139
 140     /* both major & minor is 9999 if no CUDA capable devices are present */
 141     if (dev_prop->major == 9999 && dev_prop->minor == 9999)
 142     {
 143         return -1;
 144     }
 145     /* we don't care about emulation mode */
 146     if (dev_prop->major == 0)
 147     {
 148         return -1;
 149     }
 150
 151     if (id != -1)
 152     {
 153         cu_err = cudaSetDevice(id);
 154         if (cu_err != cudaSuccess)
 155         {
 156             fprintf(stderr, "Error %d while switching to device #%d: %s\n",
 157                     cu_err, id, cudaGetErrorString(cu_err));
 158             return -1;
 159         }
 160     }
 161
 162     /* try to execute a dummy kernel */
 163     k_dummy_test<<< 1, 512>>> ();
 164     if (cudaThreadSynchronize() != cudaSuccess)
 165     {
 166         return -1;
 167     }
 168
 169     /* destroy context if we created one */
 170     if (id != -1)
 171     {
 172         cu_err = cudaDeviceReset();
 173         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 174     }
 175
 176     return 0;
 177 }
 178
 179 #ifdef HAVE_NVML
 180 /* TODO: We should actually be using md_print_warn in md_logging.c,
 181  * but we can't include mpi.h in CUDA code.
 182  */
 183 static void md_print_info(FILE       *fplog,
 184                           const char *fmt, ...)
 185 {
 186     va_list ap;
 187
 188     if (fplog != NULL)
 189     {
 190         /* We should only print to stderr on the master node,
 191          * in most cases fplog is only set on the master node, so this works.
 192          */
 193         va_start(ap, fmt);
 194         vfprintf(stderr, fmt, ap);
 195         va_end(ap);
 196
 197         va_start(ap, fmt);
 198         vfprintf(fplog, fmt, ap);
 199         va_end(ap);
 200     }
 201 }
 202 #endif /*HAVE_NVML*/
 203
 204 /* TODO: We should actually be using md_print_warn in md_logging.c,
 205  * but we can't include mpi.h in CUDA code.
 206  * This is replicated from nbnxn_cuda_data_mgmt.cu.
 207  */
 208 static void md_print_warn(FILE       *fplog,
 209                           const char *fmt, ...)
 210 {
 211     va_list ap;
 212
 213     if (fplog != NULL)
 214     {
 215         /* We should only print to stderr on the master node,
 216          * in most cases fplog is only set on the master node, so this works.
 217          */
 218         va_start(ap, fmt);
 219         fprintf(stderr, "\n");
 220         vfprintf(stderr, fmt, ap);
 221         fprintf(stderr, "\n");
 222         va_end(ap);
 223
 224         va_start(ap, fmt);
 225         fprintf(fplog, "\n");
 226         vfprintf(fplog, fmt, ap);
 227         fprintf(fplog, "\n");
 228         va_end(ap);
 229     }
 230 }
 231
 232 #ifdef HAVE_NVML
 233 /*! \brief Determines and adds the NVML device ID to the passed \cuda_dev.
 234  *
 235  * Determines and adds the NVML device ID to the passed \cuda_dev. This is done by
 236  * matching PCI-E information from \cuda_dev with the available NVML devices.
 237  *
 238  * \param[in,out] cuda_dev  CUDA device information to enrich with NVML device info
 239  * \returns                 true if \cuda_dev could be enriched with matching NVML device information.
 240  */
 241 static bool addNVMLDeviceId(gmx_device_info_t* cuda_dev)
 242 {
 243     nvmlReturn_t nvml_stat = NVML_SUCCESS;
 244     nvmlDevice_t nvml_device_id;
 245     unsigned int nvml_device_count = 0;
 246     cuda_dev->nvml_initialized = false;
 247     nvml_stat                  = nvmlDeviceGetCount ( &nvml_device_count );
 248     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetCount failed" );
 249     for (unsigned int nvml_device_idx = 0; nvml_stat == NVML_SUCCESS && nvml_device_idx < nvml_device_count; ++nvml_device_idx)
 250     {
 251         nvml_stat = nvmlDeviceGetHandleByIndex ( nvml_device_idx, &nvml_device_id );
 252         HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetHandleByIndex failed" );
 253         if (nvml_stat != NVML_SUCCESS)
 254         {
 255             break;
 256         }
 257
 258         nvmlPciInfo_t nvml_pci_info;
 259         nvml_stat = nvmlDeviceGetPciInfo ( nvml_device_id, &nvml_pci_info );
 260         HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetPciInfo failed" );
 261         if (nvml_stat != NVML_SUCCESS)
 262         {
 263             break;
 264         }
 265         if (static_cast<unsigned int>(cuda_dev->prop.pciBusID) == nvml_pci_info.bus &&
 266             static_cast<unsigned int>(cuda_dev->prop.pciDeviceID) == nvml_pci_info.device &&
 267             static_cast<unsigned int>(cuda_dev->prop.pciDomainID) == nvml_pci_info.domain)
 268         {
 269             cuda_dev->nvml_initialized = true;
 270             cuda_dev->nvml_device_id   = nvml_device_id;
 271             break;
 272         }
 273     }
 274     return cuda_dev->nvml_initialized;
 275 }
 276 #endif /*HAVE_NVML*/
 277
 278 /*! \brief Tries to set application clocks for the GPU with the given index.
 279  *
 280  * The variable \gpuid is the index of the GPU in the gpu_info.cuda_dev array
 281  * to handle the application clocks for. Application clocks are set to the
 282  * max supported value to increase performance if application clock permissions
 283  * allow this. For future GPU architectures a more sophisticated scheme might be
 284  * required.
 285  *
 286  * \param[out] fplog        log file to write to
 287  * \param[in] gpuid         index of the GPU to set application clocks for
 288  * \param[in] gpu_info      GPU info of all detected devices in the system.
 289  * \returns                 true if no error occurs during application clocks handling.
 290  */
 291 static gmx_bool init_gpu_application_clocks(FILE gmx_unused *fplog, int gmx_unused gpuid, const gmx_gpu_info_t gmx_unused *gpu_info)
 292 {
 293     const cudaDeviceProp *prop                        = &gpu_info->gpu_dev[gpuid].prop;
 294     int                   cuda_version_number         = prop->major * 10 + prop->minor;
 295     gmx_bool              bGpuCanUseApplicationClocks =
 296         ((0 == gmx_wcmatch("*Tesla*", prop->name) && cuda_version_number >= 35 ) ||
 297          (0 == gmx_wcmatch("*Quadro*", prop->name) && cuda_version_number >= 52 ));
 298 #ifndef HAVE_NVML
 299     if (bGpuCanUseApplicationClocks)
 300     {
 301         int cuda_driver  = 0;
 302         int cuda_runtime = 0;
 303         cudaDriverGetVersion(&cuda_driver);
 304         cudaRuntimeGetVersion(&cuda_runtime);
 305         md_print_warn( fplog, "Note: NVML support was not found (CUDA runtime %d.%d, driver %d.%d), so your\n"
 306                        "      %s GPU cannot use application clock support to improve performance.\n",
 307                        cuda_runtime/1000, cuda_runtime%100,
 308                        cuda_driver/1000, cuda_driver%100,
 309                        prop->name );
 310     }
 311     return true;
 312 #else /* HAVE_NVML defined */
 313     nvmlReturn_t nvml_stat = NVML_SUCCESS;
 314     char        *env;
 315     if (!bGpuCanUseApplicationClocks)
 316     {
 317         return true;
 318     }
 319     //TODO: GMX_GPU_APPLICATION_CLOCKS is currently only used to enable/disable setting of application clocks
 320     //      this variable can be later used to give a user more fine grained control.
 321     env = getenv("GMX_GPU_APPLICATION_CLOCKS");
 322     if (env != NULL && ( strcmp( env, "0") == 0 ||
 323                          gmx_strcasecmp( env, "OFF") == 0 ||
 324                          gmx_strcasecmp( env, "DISABLE") == 0 ))
 325     {
 326         return true;
 327     }
 328     nvml_stat = nvmlInit();
 329     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlInit failed." );
 330     if (nvml_stat != NVML_SUCCESS)
 331     {
 332         return false;
 333     }
 334     if (!addNVMLDeviceId( &(gpu_info->gpu_dev[gpuid])))
 335     {
 336         return false;
 337     }
 338     //get current application clocks setting
 339     unsigned int app_sm_clock  = 0;
 340     unsigned int app_mem_clock = 0;
 341     nvml_stat = nvmlDeviceGetApplicationsClock ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_SM, &app_sm_clock );
 342     if (NVML_ERROR_NOT_SUPPORTED == nvml_stat)
 343     {
 344         return false;
 345     }
 346     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
 347     nvml_stat = nvmlDeviceGetApplicationsClock ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_MEM, &app_mem_clock );
 348     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
 349     //get max application clocks
 350     unsigned int max_sm_clock  = 0;
 351     unsigned int max_mem_clock = 0;
 352     nvml_stat = nvmlDeviceGetMaxClockInfo ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_SM, &max_sm_clock );
 353     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
 354     nvml_stat = nvmlDeviceGetMaxClockInfo ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock );
 355     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
 356
 357     gpu_info->gpu_dev[gpuid].nvml_is_restricted     = NVML_FEATURE_ENABLED;
 358     gpu_info->gpu_dev[gpuid].nvml_ap_clocks_changed = false;
 359
 360     nvml_stat = nvmlDeviceGetAPIRestriction ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(gpu_info->gpu_dev[gpuid].nvml_is_restricted) );
 361     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetAPIRestriction failed" );
 362
 363     //TODO: Need to distinguish between different type of GPUs might be necessary in the future, e.g. if max application clocks should not be used
 364     //      for certain GPUs.
 365     if (nvml_stat == NVML_SUCCESS && app_sm_clock < max_sm_clock && gpu_info->gpu_dev[gpuid].nvml_is_restricted == NVML_FEATURE_DISABLED)
 366     {
 367         //TODO: Maybe need to think about something more user friendly here.
 368         md_print_info( fplog, "Changing GPU clock rates by setting application clocks for %s to (%d,%d)\n", gpu_info->gpu_dev[gpuid].prop.name, max_mem_clock, max_sm_clock);
 369         nvml_stat = nvmlDeviceSetApplicationsClocks ( gpu_info->gpu_dev[gpuid].nvml_device_id, max_mem_clock, max_sm_clock );
 370         HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
 371         gpu_info->gpu_dev[gpuid].nvml_ap_clocks_changed = true;
 372     }
 373     else if (nvml_stat == NVML_SUCCESS && app_sm_clock < max_sm_clock)
 374     {
 375         //TODO: Maybe need to think about something more user friendly here.
 376         md_print_warn( fplog, "Not possible to change GPU clocks to optimal value because of insufficient permissions to set application clocks for %s. Current values are (%d,%d). Max values are (%d,%d)\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clock permissions.\n", gpu_info->gpu_dev[gpuid].prop.name, app_mem_clock, app_sm_clock, max_mem_clock, max_sm_clock);
 377     }
 378     else if (nvml_stat == NVML_SUCCESS && app_sm_clock == max_sm_clock)
 379     {
 380         //TODO: This should probably be integrated into the GPU Properties table.
 381         md_print_info( fplog, "Application clocks (GPU clocks) for %s are (%d,%d)\n", gpu_info->gpu_dev[gpuid].prop.name, app_mem_clock, app_sm_clock);
 382     }
 383     else
 384     {
 385         //TODO: Maybe need to think about something more user friendly here.
 386         md_print_warn( fplog,  "Not possible to change GPU clocks to optimal value because application clocks handling failed with NVML error (%d): %s.\n", nvml_stat, nvmlErrorString(nvml_stat));
 387     }
 388     return (nvml_stat == NVML_SUCCESS);
 389 #endif /*HAVE_NVML*/
 390 }
 391
 392 /*! \brief Resets application clocks if changed and cleans up NVML for the passed \gpu_dev.
 393  *
 394  * \param[in] gpu_dev  CUDA device information
 395  */
 396 static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused * cuda_dev)
 397 {
 398 #ifndef HAVE_NVML
 399     GMX_UNUSED_VALUE(cuda_dev);
 400     return true;
 401 #else
 402     nvmlReturn_t nvml_stat = NVML_SUCCESS;
 403     if (cuda_dev &&
 404         cuda_dev->nvml_is_restricted == NVML_FEATURE_DISABLED &&
 405         cuda_dev->nvml_ap_clocks_changed)
 406     {
 407         nvml_stat = nvmlDeviceResetApplicationsClocks( cuda_dev->nvml_device_id );
 408         HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceResetApplicationsClocks failed" );
 409     }
 410     nvml_stat = nvmlShutdown();
 411     HANDLE_NVML_RET_ERR( nvml_stat, "nvmlShutdown failed" );
 412     return (nvml_stat == NVML_SUCCESS);
 413 #endif /*HAVE_NVML*/
 414 }
 415
 416 gmx_bool init_gpu(FILE gmx_unused *fplog, int mygpu, char *result_str,
 417                   const gmx_gpu_info_t *gpu_info,
 418                   const gmx_gpu_opt_t *gpu_opt)
 419 {
 420     cudaError_t stat;
 421     char        sbuf[STRLEN];
 422     int         gpuid;
 423
 424     assert(gpu_info);
 425     assert(result_str);
 426
 427     if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use)
 428     {
 429         sprintf(sbuf, "Trying to initialize an inexistent GPU: "
 430                 "there are %d %s-selected GPU(s), but #%d was requested.",
 431                 gpu_opt->n_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
 432         gmx_incons(sbuf);
 433     }
 434
 435     gpuid = gpu_info->gpu_dev[gpu_opt->dev_use[mygpu]].id;
 436
 437     stat = cudaSetDevice(gpuid);
 438     strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 439
 440     if (debug)
 441     {
 442         fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->gpu_dev[gpuid].prop.name);
 443     }
 444
 445     //Ignoring return value as NVML errors should be treated not critical.
 446     if (stat == cudaSuccess)
 447     {
 448         init_gpu_application_clocks(fplog, gpuid, gpu_info);
 449     }
 450     return (stat == cudaSuccess);
 451 }
 452
 453 gmx_bool free_cuda_gpu(
 454         int gmx_unused mygpu, char *result_str,
 455         const gmx_gpu_info_t gmx_unused *gpu_info,
 456         const gmx_gpu_opt_t gmx_unused *gpu_opt
 457         )
 458 {
 459     cudaError_t  stat;
 460     gmx_bool     reset_gpu_application_clocks_status = true;
 461     int          gpuid;
 462
 463     assert(result_str);
 464
 465     if (debug)
 466     {
 467         int gpuid;
 468         stat = cudaGetDevice(&gpuid);
 469         CU_RET_ERR(stat, "cudaGetDevice failed");
 470         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 471     }
 472
 473     gpuid = gpu_opt ? gpu_opt->dev_use[mygpu] : -1;
 474     if (gpuid != -1)
 475     {
 476         reset_gpu_application_clocks_status = reset_gpu_application_clocks( &(gpu_info->gpu_dev[gpuid]) );
 477     }
 478
 479     stat = cudaDeviceReset();
 480     strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 481     return (stat == cudaSuccess) && reset_gpu_application_clocks_status;
 482 }
 483
 484 /*! \brief Returns true if the gpu characterized by the device properties is
 485  *  supported by the native gpu acceleration.
 486  *
 487  * \param[in] dev_prop  the CUDA device properties of the gpus to test.
 488  * \returns             true if the GPU properties passed indicate a compatible
 489  *                      GPU, otherwise false.
 490  */
 491 static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
 492 {
 493     return (dev_prop->major >= 2);
 494 }
 495
 496 /*! \brief Helper function that checks whether a given GPU status indicates compatible GPU.
 497  *
 498  * \param[in] stat  GPU status.
 499  * \returns         true if the provided status is egpuCompatible, otherwise false.
 500  */
 501 static bool is_compatible_gpu(int stat)
 502 {
 503     return (stat == egpuCompatible);
 504 }
 505
 506 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 507  *
 508  *  Returns a status value which indicates compatibility or one of the following
 509  *  errors: incompatibility, insistence, or insanity (=unexpected behavior).
 510  *  It also returns the respective device's properties in \dev_prop (if applicable).
 511  *
 512  *  \param[in]  dev_id   the ID of the GPU to check.
 513  *  \param[out] dev_prop the CUDA device properties of the device checked.
 514  *  \returns             the status of the requested device
 515  */
 516 static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
 517 {
 518     cudaError_t stat;
 519     int         ndev;
 520
 521     stat = cudaGetDeviceCount(&ndev);
 522     if (stat != cudaSuccess)
 523     {
 524         return egpuInsane;
 525     }
 526
 527     if (dev_id > ndev - 1)
 528     {
 529         return egpuNonexistent;
 530     }
 531
 532     /* TODO: currently we do not make a distinction between the type of errors
 533      * that can appear during sanity checks. This needs to be improved, e.g if
 534      * the dummy test kernel fails to execute with a "device busy message" we
 535      * should appropriately report that the device is busy instead of insane.
 536      */
 537     if (do_sanity_checks(dev_id, dev_prop) == 0)
 538     {
 539         if (is_gmx_supported_gpu(dev_prop))
 540         {
 541             return egpuCompatible;
 542         }
 543         else
 544         {
 545             return egpuIncompatible;
 546         }
 547     }
 548     else
 549     {
 550         return egpuInsane;
 551     }
 552 }
 553
 554
 555 int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
 556 {
 557     int                i, ndev, checkres, retval;
 558     cudaError_t        stat;
 559     cudaDeviceProp     prop;
 560     gmx_device_info_t *devs;
 561
 562     assert(gpu_info);
 563     assert(err_str);
 564
 565     gpu_info->n_dev_compatible = 0;
 566
 567     ndev    = 0;
 568     devs    = NULL;
 569
 570     stat = cudaGetDeviceCount(&ndev);
 571     if (stat != cudaSuccess)
 572     {
 573         const char *s;
 574
 575         /* cudaGetDeviceCount failed which means that there is something
 576          * wrong with the machine: driver-runtime mismatch, all GPUs being
 577          * busy in exclusive mode, or some other condition which should
 578          * result in us issuing a warning a falling back to CPUs. */
 579         retval = -1;
 580         s      = cudaGetErrorString(stat);
 581         strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
 582     }
 583     else
 584     {
 585         snew(devs, ndev);
 586         for (i = 0; i < ndev; i++)
 587         {
 588             checkres = is_gmx_supported_gpu_id(i, &prop);
 589
 590             devs[i].id   = i;
 591             devs[i].prop = prop;
 592             devs[i].stat = checkres;
 593
 594             if (checkres == egpuCompatible)
 595             {
 596                 gpu_info->n_dev_compatible++;
 597             }
 598         }
 599         retval = 0;
 600     }
 601
 602     gpu_info->n_dev   = ndev;
 603     gpu_info->gpu_dev = devs;
 604
 605     return retval;
 606 }
 607
 608 void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info,
 609                           gmx_gpu_opt_t        *gpu_opt)
 610 {
 611     int  i, ncompat;
 612     int *compat;
 613
 614     assert(gpu_info);
 615     /* gpu_dev/n_dev have to be either NULL/0 or not (NULL/0) */
 616     assert((gpu_info->n_dev != 0 ? 0 : 1) ^ (gpu_info->gpu_dev == NULL ? 0 : 1));
 617
 618     snew(compat, gpu_info->n_dev);
 619     ncompat = 0;
 620     for (i = 0; i < gpu_info->n_dev; i++)
 621     {
 622         if (is_compatible_gpu(gpu_info->gpu_dev[i].stat))
 623         {
 624             ncompat++;
 625             compat[ncompat - 1] = i;
 626         }
 627     }
 628
 629     gpu_opt->n_dev_compatible = ncompat;
 630     snew(gpu_opt->dev_compatible, ncompat);
 631     memcpy(gpu_opt->dev_compatible, compat, ncompat*sizeof(*compat));
 632     sfree(compat);
 633 }
 634
 635 gmx_bool check_selected_gpus(int                  *checkres,
 636                              const gmx_gpu_info_t *gpu_info,
 637                              gmx_gpu_opt_t        *gpu_opt)
 638 {
 639     int  i, id;
 640     bool bAllOk;
 641
 642     assert(checkres);
 643     assert(gpu_info);
 644     assert(gpu_opt->n_dev_use >= 0);
 645
 646     if (gpu_opt->n_dev_use == 0)
 647     {
 648         return TRUE;
 649     }
 650
 651     assert(gpu_opt->dev_use);
 652
 653     /* we will assume that all GPUs requested are valid IDs,
 654        otherwise we'll bail anyways */
 655
 656     bAllOk = true;
 657     for (i = 0; i < gpu_opt->n_dev_use; i++)
 658     {
 659         id = gpu_opt->dev_use[i];
 660
 661         /* devices are stored in increasing order of IDs in gpu_dev */
 662         gpu_opt->dev_use[i] = id;
 663
 664         checkres[i] = (id >= gpu_info->n_dev) ?
 665             egpuNonexistent : gpu_info->gpu_dev[id].stat;
 666
 667         bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
 668     }
 669
 670     return bAllOk;
 671 }
 672
 673 void free_gpu_info(const gmx_gpu_info_t *gpu_info)
 674 {
 675     if (gpu_info == NULL)
 676     {
 677         return;
 678     }
 679
 680     sfree(gpu_info->gpu_dev);
 681 }
 682
 683 void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index)
 684 {
 685     assert(s);
 686     assert(gpu_info);
 687
 688     if (index < 0 && index >= gpu_info->n_dev)
 689     {
 690         return;
 691     }
 692
 693     gmx_device_info_t *dinfo = &gpu_info->gpu_dev[index];
 694
 695     bool               bGpuExists =
 696         dinfo->stat == egpuCompatible ||
 697         dinfo->stat == egpuIncompatible;
 698
 699     if (!bGpuExists)
 700     {
 701         sprintf(s, "#%d: %s, stat: %s",
 702                 dinfo->id, "N/A",
 703                 gpu_detect_res_str[dinfo->stat]);
 704     }
 705     else
 706     {
 707         sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
 708                 dinfo->id, dinfo->prop.name,
 709                 dinfo->prop.major, dinfo->prop.minor,
 710                 dinfo->prop.ECCEnabled ? "yes" : " no",
 711                 gpu_detect_res_str[dinfo->stat]);
 712     }
 713 }
 714
 715 int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
 716                       const gmx_gpu_opt_t  *gpu_opt,
 717                       int                   idx)
 718 {
 719     assert(gpu_info);
 720     assert(gpu_opt);
 721     assert(idx >= 0 && idx < gpu_opt->n_dev_use);
 722
 723     return gpu_info->gpu_dev[gpu_opt->dev_use[idx]].id;
 724 }
 725
 726 int get_current_cuda_gpu_device_id(void)
 727 {
 728     int gpuid;
 729     CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 730
 731     return gpuid;
 732 }
 733
 734 size_t sizeof_gpu_dev_info(void)
 735 {
 736     return sizeof(gmx_device_info_t);
 737 }
 738
 739 void gpu_set_host_malloc_and_free(bool               bUseGpuKernels,
 740                                   gmx_host_alloc_t **nb_alloc,
 741                                   gmx_host_free_t  **nb_free)
 742 {
 743     if (bUseGpuKernels)
 744     {
 745         *nb_alloc = &pmalloc;
 746         *nb_free  = &pfree;
 747     }
 748     else
 749     {
 750         *nb_alloc = NULL;
 751         *nb_free  = NULL;
 752     }
 753 }