src/gromacs/gmxlib/gpu_utils/gpu_utils.cu

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  *                This source code is part of
   5  *
   6  *                 G   R   O   M   A   C   S
   7  *
   8  *          GROningen MAchine for Chemical Simulations
   9  *
  10  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  11  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  12  * Copyright (c) 2001-2010, The GROMACS development team,
  13  * check out http://www.gromacs.org for more information.
  14
  15  * This program is free software; you can redistribute it and/or
  16  * modify it under the terms of the GNU General Public License
  17  * as published by the Free Software Foundation; either version 2
  18  * of the License, or (at your option) any later version.
  19  *
  20  * If you want to redistribute modifications, please consider that
  21  * scientific software is very special. Version control is crucial -
  22  * bugs must be traceable. We will be happy to consider code for
  23  * inclusion in the official distribution, but derived work must not
  24  * be called official GROMACS. Details are found in the README & COPYING
  25  * files - if they are missing, get the official version at www.gromacs.org.
  26  *
  27  * To help us fund GROMACS development, we humbly ask that you cite
  28  * the papers on the package - you can find them in the top README file.
  29  *
  30  * For more info, check our website at http://www.gromacs.org
  31  *
  32  * And Hey:
  33  * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
  34  */
  35
  36 #include <stdio.h>
  37 #include <stdlib.h>
  38 #include <assert.h>
  39
  40 #include "smalloc.h"
  41 #include "string2.h"
  42 #include "types/hw_info.h"
  43
  44 #include "gpu_utils.h"
  45 #include "../cuda_tools/cudautils.cuh"
  46 #include "memtestG80_core.h"
  47
  48
  49 #define QUICK_MEM       250 /*!< Amount of memory to be used in quick memtest. */
  50 #define QUICK_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS /*!< Bit flag with type of tests
  51                                                                             to run in quick memtest. */
  52 #define QUICK_ITER      3 /*!< Number of iterations in quick memtest. */
  53
  54 #define FULL_TESTS      0x3FFF /*!<  Bitflag with all test set on for full memetest. */
  55 #define FULL_ITER       25 /*!< Number of iterations in full memtest. */
  56
  57 #define TIMED_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS /*!< Bit flag with type of tests to
  58                                                                             run in time constrained memtest. */
  59
  60 /*! Number of supported GPUs */
  61 #define NB_GPUS (sizeof(SupportedGPUs)/sizeof(SupportedGPUs[0]))
  62
  63 static int cuda_max_device_count = 32; /*! Max number of devices supported by CUDA (for consistency checking).
  64                                            In reality it 16 with CUDA <=v5.0, but let's stay on the safe side. */
  65
  66 /*! Dummy kernel used for sanity checking. */
  67 __global__ void k_dummy_test(){}
  68
  69
  70 /*! Bit-flags which refer to memtestG80 test types and are used in do_memtest to specify which tests to run. */
  71 enum memtest_G80_test_types {
  72     MOVING_INVERSIONS_10 =      0x1,
  73     MOVING_INVERSIONS_RAND =    0x2,
  74     WALKING_8BIT_M86 =          0x4,
  75     WALKING_0_8BIT =            0x8,
  76     WALKING_1_8BIT =            0x10,
  77     WALKING_0_32BIT =           0x20,
  78     WALKING_1_32BIT =           0x40,
  79     RANDOM_BLOCKS =             0x80,
  80     MOD_20_32BIT =              0x100,
  81     LOGIC_1_ITER =              0x200,
  82     LOGIC_4_ITER =              0x400,
  83     LOGIC_1_ITER_SHMEM =        0x800,
  84     LOGIC_4_ITER_SHMEM =        0x1000
  85 };
  86
  87 // TODO put this list into an external file and include it so that the list is easily accessible
  88 /*! List of supported GPUs. */
  89 static const char * const SupportedGPUs[] = {
  90     /* GT400 */
  91     "Geforce GTX 480",
  92     "Geforce GTX 470",
  93     "Geforce GTX 465",
  94     "Geforce GTX 460",
  95
  96     "Tesla C2070",
  97     "Tesla C2050",
  98     "Tesla S2070",
  99     "Tesla S2050",
 100     "Tesla M2070",
 101     "Tesla M2050",
 102
 103     "Quadro 5000",
 104     "Quadro 6000",
 105
 106     /* GT200 */
 107     "Geforce GTX 295",
 108     "Geforce GTX 285",
 109     "Geforce GTX 280",
 110     "Geforce GTX 275",
 111     "Geforce GTX 260",
 112     "GeForce GTS 250",
 113     "GeForce GTS 150",
 114
 115     "GeForce GTX 285M",
 116     "GeForce GTX 280M",
 117
 118     "Tesla S1070",
 119     "Tesla C1060",
 120     "Tesla M1060",
 121
 122     "Quadro FX 5800",
 123     "Quadro FX 4800",
 124     "Quadro CX",
 125     "Quadro Plex 2200 D2",
 126     "Quadro Plex 2200 S4",
 127
 128     /* G90 */
 129     "GeForce 9800 G", /* GX2, GTX, GTX+, GT */
 130     "GeForce 9800M GTX",
 131
 132     "Quadro FX 4700",
 133     "Quadro Plex 2100 D4"
 134 };
 135
 136
 137 /*!
 138   * \brief Runs GPU sanity checks.
 139   *
 140   * Runs a series of checks to determine that the given GPU and underlying CUDA
 141   * driver/runtime functions properly.
 142   * Returns properties of a device with given ID or the one that has
 143   * already been initialized earlier in the case if of \dev_id == -1.
 144   *
 145   * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
 146   * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
 147   * \returns                0 if the device looks OK
 148   */
 149 static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
 150 {
 151     cudaError_t cu_err;
 152     int         dev_count, id;
 153
 154     cu_err = cudaGetDeviceCount(&dev_count);
 155     if (cu_err != cudaSuccess)
 156     {
 157        fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
 158                cudaGetErrorString(cu_err));
 159         return -1;
 160     }
 161
 162     /* no CUDA compatible device at all */
 163     if (dev_count == 0)
 164         return -1;
 165
 166     /* things might go horribly wrong if cudart is not compatible with the driver */
 167     if (dev_count < 0 || dev_count > cuda_max_device_count)
 168         return -1;
 169
 170     if (dev_id == -1) /* device already selected let's not destroy the context */
 171     {
 172         cu_err = cudaGetDevice(&id);
 173         if (cu_err != cudaSuccess)
 174         {
 175             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
 176                     cudaGetErrorString(cu_err));
 177             return -1;
 178         }
 179     }
 180     else
 181     {
 182         id = dev_id;
 183         if (id > dev_count - 1) /* pfff there's no such device */
 184         {
 185             fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
 186                     dev_id, dev_count);
 187             return -1;
 188         }
 189     }
 190
 191     memset(dev_prop, 0, sizeof(cudaDeviceProp));
 192     cu_err = cudaGetDeviceProperties(dev_prop, id);
 193     if (cu_err != cudaSuccess)
 194     {
 195         fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
 196                 cudaGetErrorString(cu_err));
 197         return -1;
 198     }
 199
 200     /* both major & minor is 9999 if no CUDA capable devices are present */
 201     if (dev_prop->major == 9999 && dev_prop->minor == 9999)
 202         return -1;
 203     /* we don't care about emulation mode */
 204     if (dev_prop->major == 0)
 205         return -1;
 206
 207     if (id != -1)
 208     {
 209         cu_err = cudaSetDevice(id);
 210         if (cu_err != cudaSuccess)
 211         {
 212             fprintf(stderr, "Error %d while switching to device #%d: %s\n",
 213                     cu_err, id, cudaGetErrorString(cu_err));
 214             return -1;
 215         }
 216     }
 217
 218     /* try to execute a dummy kernel */
 219     k_dummy_test<<<1, 512>>>();
 220     CU_LAUNCH_ERR_SYNC("dummy test kernel");
 221
 222     /* destroy context if we created one */
 223     if (id != -1)
 224     {
 225 #if CUDA_VERSION < 4000
 226         cu_err = cudaThreadExit();
 227         CU_RET_ERR(cu_err, "cudaThreadExit failed");
 228 #else
 229         cu_err = cudaDeviceReset();
 230         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 231 #endif
 232     }
 233
 234     return 0;
 235 }
 236
 237
 238 /*!
 239  * \brief Checks whether the GPU with the given name is supported in Gromacs-OpenMM.
 240  *
 241  * \param[in] gpu_name  the name of the CUDA device
 242  * \returns             TRUE if the device is supported, otherwise FALSE
 243  */
 244 static bool is_gmx_openmm_supported_gpu_name(char *gpuName)
 245 {
 246     size_t i;
 247     for (i = 0; i < NB_GPUS; i++)
 248     {
 249         trim(gpuName);
 250         if (gmx_strncasecmp(gpuName, SupportedGPUs[i], strlen(SupportedGPUs[i])) == 0)
 251             return 1;
 252     }
 253     return 0;
 254 }
 255
 256 /*! \brief Checks whether the GPU with the given device id is supported in Gromacs-OpenMM.
 257  *
 258  * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
 259  * \param[out] gpu_name Set to contain the name of the CUDA device, if NULL passed, no device name is set.
 260  * \returns             TRUE if the device is supported, otherwise FALSE
 261  *
 262  */
 263 gmx_bool is_gmx_openmm_supported_gpu(int dev_id, char *gpu_name)
 264 {
 265     cudaDeviceProp dev_prop;
 266
 267     if (debug) fprintf(debug, "Checking compatibility with device #%d, %s\n", dev_id, gpu_name);
 268
 269     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 270         return -1;
 271
 272     if (gpu_name != NULL)
 273     {
 274         strcpy(gpu_name, dev_prop.name);
 275     }
 276     return is_gmx_openmm_supported_gpu_name(dev_prop.name);
 277 }
 278
 279
 280 /*!
 281  * \brief Runs a set of memory tests specified by the given bit-flags.
 282  * Tries to allocate and do the test on \p megs Mb memory or
 283  * the greatest amount that can be allocated (>10Mb).
 284  * In case if an error is detected it stops without finishing the remaining
 285  * steps/iterations and returns greater then zero value.
 286  * In case of other errors (e.g. kernel launch errors, device querying errors)
 287  * -1 is returned.
 288  *
 289  * \param[in] which_tests   variable with bit-flags of the requested tests
 290  * \param[in] megs          amount of memory that will be tested in MB
 291  * \param[in] iter          number of iterations
 292  * \returns                 0 if no error was detected, otherwise >0
 293  */
 294 static int do_memtest(unsigned int which_tests, int megs, int iter)
 295 {
 296     memtestState    tester;
 297     int             i;
 298     uint            err_count; //, err_iter;
 299
 300     // no parameter check as this fn won't be called externally
 301
 302     // let's try to allocate the mem
 303     while (!tester.allocate(megs) && (megs - 10 > 0))
 304         { megs -= 10; tester.deallocate(); }
 305
 306     if (megs <= 10)
 307     {
 308         fprintf(stderr, "Unable to allocate GPU memory!\n");
 309         return -1;
 310     }
 311
 312     // clear the first 18 bits
 313     which_tests &= 0x3FFF;
 314     for (i = 0; i < iter; i++)
 315     {
 316         // Moving Inversions (ones and zeros)
 317         if ((MOVING_INVERSIONS_10 & which_tests) == MOVING_INVERSIONS_10)
 318         {
 319             tester.gpuMovingInversionsOnesZeros(err_count);
 320             if (err_count > 0)
 321                 return MOVING_INVERSIONS_10;
 322         }
 323         // Moving Inversions (random)
 324         if ((MOVING_INVERSIONS_RAND & which_tests) == MOVING_INVERSIONS_RAND)
 325         {
 326             tester.gpuMovingInversionsRandom(err_count);
 327             if (err_count > 0)
 328                 return MOVING_INVERSIONS_RAND;
 329         }
 330        // Memtest86 Walking 8-bit
 331         if ((WALKING_8BIT_M86 & which_tests) == WALKING_8BIT_M86)
 332         {
 333             for (uint shift = 0; shift < 8; shift++)
 334             {
 335                 tester.gpuWalking8BitM86(err_count, shift);
 336                 if (err_count > 0)
 337                     return WALKING_8BIT_M86;
 338             }
 339       }
 340         // True Walking zeros (8-bit)
 341         if ((WALKING_0_8BIT & which_tests) == WALKING_0_8BIT)
 342         {
 343             for (uint shift = 0; shift < 8; shift++)
 344             {
 345                 tester.gpuWalking8Bit(err_count, false, shift);
 346                 if (err_count > 0)
 347                     return WALKING_0_8BIT;
 348             }
 349         }
 350         // True Walking ones (8-bit)
 351         if ((WALKING_1_8BIT & which_tests) == WALKING_1_8BIT)
 352         {
 353             for (uint shift = 0; shift < 8; shift++)
 354             {
 355                 tester.gpuWalking8Bit(err_count, true, shift);
 356                 if (err_count > 0)
 357                     return WALKING_1_8BIT;
 358             }
 359         }
 360         // Memtest86 Walking zeros (32-bit)
 361         if ((WALKING_0_32BIT & which_tests) == WALKING_0_32BIT)
 362         {
 363             for (uint shift = 0; shift < 32; shift++)
 364             {
 365                 tester.gpuWalking32Bit(err_count, false, shift);
 366                 if (err_count > 0)
 367                     return WALKING_0_32BIT;
 368             }
 369         }
 370        // Memtest86 Walking ones (32-bit)
 371         if ((WALKING_1_32BIT & which_tests) == WALKING_1_32BIT)
 372         {
 373             for (uint shift = 0; shift < 32; shift++)
 374             {
 375                 tester.gpuWalking32Bit(err_count, true, shift);
 376                 if (err_count > 0)
 377                     return WALKING_1_32BIT;
 378             }
 379        }
 380         // Random blocks
 381         if ((RANDOM_BLOCKS & which_tests) == RANDOM_BLOCKS)
 382         {
 383             tester.gpuRandomBlocks(err_count,rand());
 384             if (err_count > 0)
 385                 return RANDOM_BLOCKS;
 386
 387         }
 388
 389         // Memtest86 Modulo-20
 390         if ((MOD_20_32BIT & which_tests) == MOD_20_32BIT)
 391         {
 392             for (uint shift = 0; shift < 20; shift++)
 393             {
 394                 tester.gpuModuloX(err_count, shift, rand(), 20, 2);
 395                 if (err_count > 0)
 396                     return MOD_20_32BIT;
 397             }
 398         }
 399         // Logic (one iteration)
 400         if ((LOGIC_1_ITER & which_tests) == LOGIC_1_ITER)
 401         {
 402             tester.gpuShortLCG0(err_count,1);
 403             if (err_count > 0)
 404                 return LOGIC_1_ITER;
 405         }
 406         // Logic (4 iterations)
 407         if ((LOGIC_4_ITER & which_tests) == LOGIC_4_ITER)
 408         {
 409             tester.gpuShortLCG0(err_count,4);
 410             if (err_count > 0)
 411                 return LOGIC_4_ITER;
 412
 413         }
 414         // Logic (shared memory, one iteration)
 415         if ((LOGIC_1_ITER_SHMEM & which_tests) == LOGIC_1_ITER_SHMEM)
 416         {
 417             tester.gpuShortLCG0Shmem(err_count,1);
 418             if (err_count > 0)
 419                 return LOGIC_1_ITER_SHMEM;
 420         }
 421         // Logic (shared-memory, 4 iterations)
 422         if ((LOGIC_4_ITER_SHMEM & which_tests) == LOGIC_4_ITER_SHMEM)
 423         {
 424             tester.gpuShortLCG0Shmem(err_count,4);
 425             if (err_count > 0)
 426                 return LOGIC_4_ITER_SHMEM;
 427         }
 428     }
 429
 430     tester.deallocate();
 431     return err_count;
 432 }
 433
 434 /*! \brief Runs a quick memory test and returns 0 in case if no error is detected.
 435  * If an error is detected it stops before completing the test and returns a
 436  * value greater then 0. In case of other errors (e.g. kernel launch errors,
 437  * device querying errors) -1 is returned.
 438  *
 439  * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
 440  * \returns             0 if no error was detected, otherwise >0
 441  */
 442 int do_quick_memtest(int dev_id)
 443 {
 444     cudaDeviceProp  dev_prop;
 445     int             devmem, res, time=0;
 446
 447     if (debug) { time = getTimeMilliseconds(); }
 448
 449     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 450     {
 451         // something went wrong
 452         return -1;
 453     }
 454
 455     if (debug)
 456     {
 457         devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 458         fprintf(debug, ">> Running QUICK memtests on %d MiB (out of total %d MiB), %d iterations\n",
 459             QUICK_MEM, devmem, QUICK_ITER);
 460     }
 461
 462     res = do_memtest(QUICK_TESTS, QUICK_MEM, QUICK_ITER);
 463
 464     if (debug)
 465     {
 466         fprintf(debug, "Q-RES = %d\n", res);
 467         fprintf(debug, "Q-runtime: %d ms\n", getTimeMilliseconds() - time);
 468     }
 469
 470     /* destroy context only if we created it */
 471     if (dev_id !=-1) cudaThreadExit();
 472     return res;
 473 }
 474
 475 /*! \brief Runs a full memory test and returns 0 in case if no error is detected.
 476  * If an error is detected  it stops before completing the test and returns a
 477  * value greater then 0. In case of other errors (e.g. kernel launch errors,
 478  * device querying errors) -1 is returned.
 479  *
 480  * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
 481  * \returns             0 if no error was detected, otherwise >0
 482  */
 483
 484 int do_full_memtest(int dev_id)
 485 {
 486     cudaDeviceProp  dev_prop;
 487     int             devmem, res, time=0;
 488
 489     if (debug) { time = getTimeMilliseconds(); }
 490
 491     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 492     {
 493         // something went wrong
 494         return -1;
 495     }
 496
 497     devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 498
 499     if (debug)
 500     {
 501         fprintf(debug, ">> Running FULL memtests on %d MiB (out of total %d MiB), %d iterations\n",
 502             devmem, devmem, FULL_ITER);
 503     }
 504
 505     /* do all test on the entire memory */
 506     res = do_memtest(FULL_TESTS, devmem, FULL_ITER);
 507
 508     if (debug)
 509     {
 510         fprintf(debug, "F-RES = %d\n", res);
 511         fprintf(debug, "F-runtime: %d ms\n", getTimeMilliseconds() - time);
 512     }
 513
 514     /* destroy context only if we created it */
 515     if (dev_id != -1) cudaThreadExit();
 516     return res;
 517 }
 518
 519 /*! \brief Runs a time constrained memory test and returns 0 in case if no error is detected.
 520  * If an error is detected it stops before completing the test and returns a value greater
 521  * than zero. In case of other errors (e.g. kernel launch errors, device querying errors) -1
 522  * is returned. Note, that test iterations are not interrupted therefor the total runtime of
 523  * the test will always be multipple of one iteration's runtime.
 524  *
 525  * \param[in] dev_id        the device id of the GPU or -1 if the device has laredy been selected
 526  * \param[in] time_constr   the time limit of the testing
 527  * \returns                 0 if no error was detected, otherwise >0
 528  */
 529 int do_timed_memtest(int dev_id, int time_constr)
 530 {
 531     cudaDeviceProp  dev_prop;
 532     int             devmem, res=0, time=0, startt;
 533
 534     if (debug) { time = getTimeMilliseconds(); }
 535
 536     time_constr *= 1000;  /* convert to ms for convenience */
 537     startt = getTimeMilliseconds();
 538
 539     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 540     {
 541         // something went wrong
 542         return -1;
 543     }
 544
 545     devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 546
 547     if (debug)
 548     {
 549         fprintf(debug, ">> Running time constrained memtests on %d MiB (out of total %d MiB), time limit of %d s \n",
 550         devmem, devmem, time_constr);
 551     }
 552
 553     /* do the TIMED_TESTS set, one step at a time on the entire memory
 554        that can be allocated, and stop when the given time is exceeded */
 555     while ( ((int)getTimeMilliseconds() - startt) < time_constr)
 556     {
 557         res = do_memtest(TIMED_TESTS, devmem, 1);
 558         if (res != 0) break;
 559     }
 560
 561     if (debug)
 562     {
 563         fprintf(debug, "T-RES = %d\n", res);
 564         fprintf(debug, "T-runtime: %d ms\n", getTimeMilliseconds() - time);
 565     }
 566
 567     /* destroy context only if we created it */
 568     if (dev_id != -1) cudaThreadExit();
 569     return res;
 570 }
 571
 572 /*! \brief Initializes the GPU with the given index.
 573  *
 574  * The varible \mygpu is the index of the GPU to initialize in the
 575  * gpu_info.cuda_dev array.
 576  *
 577  * \param[in]  mygpu        index of the GPU to initialize
 578  * \param[out] result_str   the message related to the error that occurred
 579  *                          during the initialization (if there was any).
 580  * \param[in] gpu_info      GPU info of all detected devices in the system.
 581  * \returns                 true if no error occurs during initialization.
 582  */
 583 gmx_bool init_gpu(int mygpu, char *result_str, const gmx_gpu_info_t *gpu_info)
 584 {
 585     cudaError_t stat;
 586     char sbuf[STRLEN];
 587     int gpuid;
 588
 589     assert(gpu_info);
 590     assert(result_str);
 591
 592     if (mygpu < 0 || mygpu >= gpu_info->ncuda_dev_use)
 593     {
 594         sprintf(sbuf, "Trying to initialize an inexistent GPU: "
 595                 "there are %d %s-selected GPU(s), but #%d was requested.",
 596                  gpu_info->ncuda_dev_use, gpu_info->bUserSet ? "user" : "auto", mygpu);
 597         gmx_incons(sbuf);
 598     }
 599
 600     gpuid = gpu_info->cuda_dev[gpu_info->cuda_dev_use[mygpu]].id;
 601
 602     stat = cudaSetDevice(gpuid);
 603     strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 604
 605     if (debug)
 606     {
 607         fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->cuda_dev[gpuid].prop.name);
 608     }
 609
 610     return (stat == cudaSuccess);
 611 }
 612
 613 /*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
 614  *
 615  * The context is explicitly destroyed and therefore all data uploaded to the GPU
 616  * is lost. This should only be called when none of this data is required anymore.
 617  *
 618  * \param[out] result_str   the message related to the error that occurred
 619  *                          during the initialization (if there was any).
 620  * \returns                 true if no error occurs during the freeing.
 621  */
 622 gmx_bool free_gpu(char *result_str)
 623 {
 624     cudaError_t stat;
 625
 626     assert(result_str);
 627
 628     if (debug)
 629     {
 630         int gpuid;
 631         stat = cudaGetDevice(&gpuid);
 632         CU_RET_ERR(stat, "cudaGetDevice failed");
 633         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 634     }
 635
 636 #if CUDA_VERSION < 4000
 637     stat = cudaThreadExit();
 638 #else
 639     stat = cudaDeviceReset();
 640 #endif
 641     strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 642
 643     return (stat == cudaSuccess);
 644 }
 645
 646 /*! \brief Returns true if the gpu characterized by the device properties is
 647  *  supported by the native gpu acceleration.
 648  *
 649  * \param[in] dev_prop  the CUDA device properties of the gpus to test.
 650  * \returns             true if the GPU properties passed indicate a compatible
 651  *                      GPU, otherwise false.
 652  */
 653 static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
 654 {
 655     return (dev_prop->major >= 2);
 656 }
 657
 658 /*! \brief Helper function that checks whether a given GPU status indicates compatible GPU.
 659  *
 660  * \param[in] stat  GPU status.
 661  * \returns         true if the provided status is egpuCompatible, otherwise false.
 662  */
 663 static bool is_compatible_gpu(int stat)
 664 {
 665     return (stat == egpuCompatible);
 666 }
 667
 668 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 669  *
 670  *  Returns a status value which indicates compatibility or one of the following
 671  *  errors: incompatibility, insistence, or insanity (=unexpected behavior).
 672  *  It also returns the respective device's properties in \dev_prop (if applicable).
 673  *
 674  *  \param[in]  dev_id   the ID of the GPU to check.
 675  *  \param[out] dev_prop the CUDA device properties of the device checked.
 676  *  \returns             the status of the requested device
 677  */
 678 static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
 679 {
 680     cudaError_t stat;
 681     int         ndev;
 682
 683     stat = cudaGetDeviceCount(&ndev);
 684     CU_RET_ERR(stat, "cudaGetDeviceCount failed");
 685
 686     if (dev_id > ndev - 1)
 687     {
 688         return egpuNonexistent;
 689     }
 690
 691     if (do_sanity_checks(dev_id, dev_prop) == 0)
 692     {
 693         if (is_gmx_supported_gpu(dev_prop))
 694         {
 695             return egpuCompatible;
 696         }
 697         else
 698         {
 699             return egpuIncompatible;
 700         }
 701     }
 702     else
 703     {
 704         return egpuInsane;
 705     }
 706 }
 707
 708
 709 /*! \brief Detect all NVIDIA GPUs in the system.
 710  *
 711  *  Will detect every NVIDIA GPU supported by the device driver in use. Also
 712  *  check for the compatibility of each and fill the gpu_info->cuda_dev array
 713  *  with the required information on each the device: ID, device properties,
 714  *  status.
 715  *
 716  *  \param[in] gpu_info    pointer to structure holding GPU information.
 717  */
 718 void detect_cuda_gpus(gmx_gpu_info_t *gpu_info)
 719 {
 720     int             i, ndev, checkres;
 721     cudaError_t     stat;
 722     cudaDeviceProp  prop;
 723     cuda_dev_info_t *devs;
 724
 725     assert(gpu_info);
 726
 727     stat = cudaGetDeviceCount(&ndev);
 728     CU_RET_ERR(stat, "cudaGetDeviceCount failed");
 729
 730     snew(devs, ndev);
 731     for (i = 0; i < ndev; i++)
 732     {
 733         checkres = is_gmx_supported_gpu_id(i, &prop);
 734
 735         devs[i].id   = i;
 736         devs[i].prop = prop;
 737         devs[i].stat = checkres;
 738     }
 739
 740     gpu_info->ncuda_dev = ndev;
 741     gpu_info->cuda_dev  = devs;
 742 }
 743
 744 /*! \brief Select the GPUs compatible with the native GROMACS acceleration.
 745  *
 746  * This function selects the compatible gpus and initializes
 747  * gpu_info->cuda_dev_use and gpu_info->ncuda_dev_use.
 748  *
 749  * Given the list of GPUs available in the system the it checks each gpu in
 750  * gpu_info->cuda_dev and puts the the indices (into gpu_info->cuda_dev) of
 751  * the compatible ones into cuda_dev_use with this marking the respective
 752  * GPUs as "available for use."
 753  * Note that \detect_cuda_gpus must have been called before.
 754  *
 755  * \param[in]    gpu_info    pointer to structure holding GPU information
 756  */
 757 void pick_compatible_gpus(gmx_gpu_info_t *gpu_info)
 758 {
 759     int i, ncompat;
 760     int *compat;
 761
 762     assert(gpu_info);
 763     /* cuda_dev/ncuda_dev have to be either NULL/0 or not (NULL/0) */
 764     assert((gpu_info->ncuda_dev != 0 ? 0 : 1) ^ (gpu_info->cuda_dev == NULL ? 0 : 1));
 765
 766     snew(compat, gpu_info->ncuda_dev);
 767     ncompat = 0;
 768     for (i = 0; i < gpu_info->ncuda_dev; i++)
 769     {
 770         if (is_compatible_gpu(gpu_info->cuda_dev[i].stat))
 771         {
 772             ncompat++;
 773             compat[ncompat - 1] = i;
 774         }
 775     }
 776
 777     gpu_info->ncuda_dev_use = ncompat;
 778     snew(gpu_info->cuda_dev_use, ncompat);
 779     memcpy(gpu_info->cuda_dev_use, compat, ncompat*sizeof(*compat));
 780     sfree(compat);
 781 }
 782
 783 /*! \brief Check the existence/compatibility of a set of GPUs specified by their device IDs.
 784  *
 785  * Given the a list of GPU devide IDs in \requested_devs, check for the
 786  * existence and compatibility of the respective GPUs and fill in \gpu_info
 787  * with the collected information. Also provide the caller with an array with
 788  * the result of checks in \checkres.
 789  *
 790  * \param[out]  checkres    check result for each ID passed in \requested_devs
 791  * \param[in]   gpu_info    pointer to structure holding GPU information
 792  * \param[in]   requested_devs array of requested device IDs
 793  * \param[in]   count       number of IDs in \requested_devs
 794  * \returns                 TRUE if every requested GPU is compatible
 795  */
 796 gmx_bool check_select_cuda_gpus(int *checkres, gmx_gpu_info_t *gpu_info,
 797                                 const int *requested_devs, int count)
 798 {
 799     int i, id;
 800     bool bAllOk;
 801
 802     assert(checkres);
 803     assert(gpu_info);
 804     assert(requested_devs);
 805     assert(count >= 0);
 806
 807     if (count == 0)
 808     {
 809         return TRUE;
 810     }
 811
 812     /* we will assume that all GPUs requested are valid IDs,
 813        otherwise we'll bail anyways */
 814     gpu_info->ncuda_dev_use = count;
 815     snew(gpu_info->cuda_dev_use, count);
 816
 817     bAllOk = true;
 818     for (i = 0; i < count; i++)
 819     {
 820         id = requested_devs[i];
 821
 822         /* devices are stored in increasing order of IDs in cuda_dev */
 823         gpu_info->cuda_dev_use[i] = id;
 824
 825         checkres[i] = (id >= gpu_info->ncuda_dev) ?
 826             egpuNonexistent : gpu_info->cuda_dev[id].stat;
 827
 828         bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
 829     }
 830
 831     return bAllOk;
 832 }
 833
 834 /*! \brief Frees the cuda_dev and cuda_dev_use array fields of \gpu_info.
 835  *
 836  * \param[in]    gpu_info    pointer to structure holding GPU information
 837  */
 838 void free_gpu_info(const gmx_gpu_info_t *gpu_info)
 839 {
 840     if (gpu_info == NULL)
 841     {
 842         return;
 843     }
 844
 845     sfree(gpu_info->cuda_dev_use);
 846     sfree(gpu_info->cuda_dev);
 847 }
 848
 849 /*! \brief Formats and returns a device information string for a given GPU.
 850  *
 851  * Given an index *directly* into the array of available GPUs (cuda_dev)
 852  * returns a formatted info string for the respective GPU which includes
 853  * ID, name, compute capability, and detection status.
 854  *
 855  * \param[out]  s           pointer to output string (has to be allocated externally)
 856  * \param[in]   gpu_info    pointer to structure holding GPU information
 857  * \param[in]   index       an index *directly* into the array of available GPUs
 858  */
 859 void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index)
 860 {
 861     assert(s);
 862     assert(gpu_info);
 863
 864     if (index < 0 && index >= gpu_info->ncuda_dev)
 865     {
 866         return;
 867     }
 868
 869     cuda_dev_info_t *dinfo = &gpu_info->cuda_dev[index];
 870
 871     bool bGpuExists =
 872         dinfo->stat == egpuCompatible ||
 873         dinfo->stat == egpuIncompatible;
 874
 875     if (!bGpuExists)
 876     {
 877         sprintf(s, "#%d: %s, stat: %s",
 878                 dinfo->id, "N/A",
 879                 gpu_detect_res_str[dinfo->stat]);
 880     }
 881     else
 882     {
 883         sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
 884                 dinfo->id, dinfo->prop.name,
 885                 dinfo->prop.major, dinfo->prop.minor,
 886                 dinfo->prop.ECCEnabled ? "yes" : " no",
 887                 gpu_detect_res_str[dinfo->stat]);
 888     }
 889 }
 890
 891 /*! \brief Returns the device ID of the GPU with a given index into the array of used GPUs.
 892  *
 893  * Getter function which, given an index into the array of GPUs in use
 894  * (cuda_dev_use) -- typically a tMPI/MPI rank --, returns the device ID of the
 895  * respective CUDA GPU.
 896  *
 897  * \param[in]    gpu_info   pointer to structure holding GPU information
 898  * \param[in]    idx        index into the array of used GPUs
 899  * \returns                 device ID of the requested GPU
 900  */
 901 int get_gpu_device_id(const gmx_gpu_info_t *gpu_info, int idx)
 902 {
 903     assert(gpu_info);
 904     if (idx < 0 && idx >= gpu_info->ncuda_dev_use)
 905     {
 906         return -1;
 907     }
 908
 909     return gpu_info->cuda_dev[gpu_info->cuda_dev_use[idx]].id;
 910 }
 911
 912 /*! \brief Returns the device ID of the GPU currently in use.
 913  *
 914  * The GPU used is the one that is active at the time of the call in the active context.
 915  *
 916  * \param[in]    gpu_info   pointer to structure holding GPU information
 917  * \returns                 device ID of the GPU in use at the time of the call
 918  */
 919 int get_current_gpu_device_id(void)
 920 {
 921     int gpuid;
 922     CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 923
 924     return gpuid;
 925 }