src/gromacs/gmxlib/gpu_utils/gpu_utils.cu

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  *                This source code is part of
   5  *
   6  *                 G   R   O   M   A   C   S
   7  *
   8  *          GROningen MAchine for Chemical Simulations
   9  *
  10  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  11  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  12  * Copyright (c) 2001-2010, The GROMACS development team,
  13  * check out http://www.gromacs.org for more information.
  14
  15  * This program is free software; you can redistribute it and/or
  16  * modify it under the terms of the GNU General Public License
  17  * as published by the Free Software Foundation; either version 2
  18  * of the License, or (at your option) any later version.
  19  *
  20  * If you want to redistribute modifications, please consider that
  21  * scientific software is very special. Version control is crucial -
  22  * bugs must be traceable. We will be happy to consider code for
  23  * inclusion in the official distribution, but derived work must not
  24  * be called official GROMACS. Details are found in the README & COPYING
  25  * files - if they are missing, get the official version at www.gromacs.org.
  26  *
  27  * To help us fund GROMACS development, we humbly ask that you cite
  28  * the papers on the package - you can find them in the top README file.
  29  *
  30  * For more info, check our website at http://www.gromacs.org
  31  *
  32  * And Hey:
  33  * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
  34  */
  35
  36 #include <stdio.h>
  37 #include <stdlib.h>
  38 #include <assert.h>
  39
  40 #include "smalloc.h"
  41 #include "string2.h"
  42 #include "types/hw_info.h"
  43
  44 #include "gpu_utils.h"
  45 #include "../cuda_tools/cudautils.cuh"
  46 #include "memtestG80_core.h"
  47
  48
  49 #define QUICK_MEM       250 /*!< Amount of memory to be used in quick memtest. */
  50 #define QUICK_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS /*!< Bit flag with type of tests
  51                                                                             to run in quick memtest. */
  52 #define QUICK_ITER      3 /*!< Number of iterations in quick memtest. */
  53
  54 #define FULL_TESTS      0x3FFF /*!<  Bitflag with all test set on for full memetest. */
  55 #define FULL_ITER       25 /*!< Number of iterations in full memtest. */
  56
  57 #define TIMED_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS /*!< Bit flag with type of tests to
  58                                                                             run in time constrained memtest. */
  59
  60 /*! Number of supported GPUs */
  61 #define NB_GPUS (sizeof(SupportedGPUs)/sizeof(SupportedGPUs[0]))
  62
  63 static int cuda_max_device_count = 32; /*! Max number of devices supported by CUDA (for consistency checking).
  64                                            In reality it 16 with CUDA <=v5.0, but let's stay on the safe side. */
  65
  66 /*! Dummy kernel used for sanity checking. */
  67 __global__ void k_dummy_test(){}
  68
  69
  70 /*! Bit-flags which refer to memtestG80 test types and are used in do_memtest to specify which tests to run. */
  71 enum memtest_G80_test_types {
  72     MOVING_INVERSIONS_10 =      0x1,
  73     MOVING_INVERSIONS_RAND =    0x2,
  74     WALKING_8BIT_M86 =          0x4,
  75     WALKING_0_8BIT =            0x8,
  76     WALKING_1_8BIT =            0x10,
  77     WALKING_0_32BIT =           0x20,
  78     WALKING_1_32BIT =           0x40,
  79     RANDOM_BLOCKS =             0x80,
  80     MOD_20_32BIT =              0x100,
  81     LOGIC_1_ITER =              0x200,
  82     LOGIC_4_ITER =              0x400,
  83     LOGIC_1_ITER_SHMEM =        0x800,
  84     LOGIC_4_ITER_SHMEM =        0x1000
  85 };
  86
  87 // TODO put this list into an external file and include it so that the list is easily accessible
  88 /*! List of supported GPUs. */
  89 static const char * const SupportedGPUs[] = {
  90     /* GT400 */
  91     "Geforce GTX 480",
  92     "Geforce GTX 470",
  93     "Geforce GTX 465",
  94     "Geforce GTX 460",
  95
  96     "Tesla C2070",
  97     "Tesla C2050",
  98     "Tesla S2070",
  99     "Tesla S2050",
 100     "Tesla M2070",
 101     "Tesla M2050",
 102
 103     "Quadro 5000",
 104     "Quadro 6000",
 105
 106     /* GT200 */
 107     "Geforce GTX 295",
 108     "Geforce GTX 285",
 109     "Geforce GTX 280",
 110     "Geforce GTX 275",
 111     "Geforce GTX 260",
 112     "GeForce GTS 250",
 113     "GeForce GTS 150",
 114
 115     "GeForce GTX 285M",
 116     "GeForce GTX 280M",
 117
 118     "Tesla S1070",
 119     "Tesla C1060",
 120     "Tesla M1060",
 121
 122     "Quadro FX 5800",
 123     "Quadro FX 4800",
 124     "Quadro CX",
 125     "Quadro Plex 2200 D2",
 126     "Quadro Plex 2200 S4",
 127
 128     /* G90 */
 129     "GeForce 9800 G", /* GX2, GTX, GTX+, GT */
 130     "GeForce 9800M GTX",
 131
 132     "Quadro FX 4700",
 133     "Quadro Plex 2100 D4"
 134 };
 135
 136
 137 /*!
 138   * \brief Runs GPU sanity checks.
 139   *
 140   * Runs a series of checks to determine that the given GPU and underlying CUDA
 141   * driver/runtime functions properly.
 142   * Returns properties of a device with given ID or the one that has
 143   * already been initialized earlier in the case if of \dev_id == -1.
 144   *
 145   * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
 146   * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
 147   * \returns                0 if the device looks OK
 148   *
 149   * TODO: introduce errors codes and handle errors more smoothly.
 150   */
 151 static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
 152 {
 153     cudaError_t cu_err;
 154     int         dev_count, id;
 155
 156     cu_err = cudaGetDeviceCount(&dev_count);
 157     if (cu_err != cudaSuccess)
 158     {
 159        fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
 160                cudaGetErrorString(cu_err));
 161         return -1;
 162     }
 163
 164     /* no CUDA compatible device at all */
 165     if (dev_count == 0)
 166         return -1;
 167
 168     /* things might go horribly wrong if cudart is not compatible with the driver */
 169     if (dev_count < 0 || dev_count > cuda_max_device_count)
 170         return -1;
 171
 172     if (dev_id == -1) /* device already selected let's not destroy the context */
 173     {
 174         cu_err = cudaGetDevice(&id);
 175         if (cu_err != cudaSuccess)
 176         {
 177             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
 178                     cudaGetErrorString(cu_err));
 179             return -1;
 180         }
 181     }
 182     else
 183     {
 184         id = dev_id;
 185         if (id > dev_count - 1) /* pfff there's no such device */
 186         {
 187             fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
 188                     dev_id, dev_count);
 189             return -1;
 190         }
 191     }
 192
 193     memset(dev_prop, 0, sizeof(cudaDeviceProp));
 194     cu_err = cudaGetDeviceProperties(dev_prop, id);
 195     if (cu_err != cudaSuccess)
 196     {
 197         fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
 198                 cudaGetErrorString(cu_err));
 199         return -1;
 200     }
 201
 202     /* both major & minor is 9999 if no CUDA capable devices are present */
 203     if (dev_prop->major == 9999 && dev_prop->minor == 9999)
 204         return -1;
 205     /* we don't care about emulation mode */
 206     if (dev_prop->major == 0)
 207         return -1;
 208
 209     if (id != -1)
 210     {
 211         cu_err = cudaSetDevice(id);
 212         if (cu_err != cudaSuccess)
 213         {
 214             fprintf(stderr, "Error %d while switching to device #%d: %s\n",
 215                     cu_err, id, cudaGetErrorString(cu_err));
 216             return -1;
 217         }
 218     }
 219
 220     /* try to execute a dummy kernel */
 221     k_dummy_test<<<1, 512>>>();
 222     if (cudaThreadSynchronize() != cudaSuccess)
 223     {
 224         return -1;
 225     }
 226
 227     /* destroy context if we created one */
 228     if (id != -1)
 229     {
 230 #if CUDA_VERSION < 4000
 231         cu_err = cudaThreadExit();
 232         CU_RET_ERR(cu_err, "cudaThreadExit failed");
 233 #else
 234         cu_err = cudaDeviceReset();
 235         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 236 #endif
 237     }
 238
 239     return 0;
 240 }
 241
 242
 243 /*!
 244  * \brief Checks whether the GPU with the given name is supported in Gromacs-OpenMM.
 245  *
 246  * \param[in] gpu_name  the name of the CUDA device
 247  * \returns             TRUE if the device is supported, otherwise FALSE
 248  */
 249 static bool is_gmx_openmm_supported_gpu_name(char *gpuName)
 250 {
 251     size_t i;
 252     for (i = 0; i < NB_GPUS; i++)
 253     {
 254         trim(gpuName);
 255         if (gmx_strncasecmp(gpuName, SupportedGPUs[i], strlen(SupportedGPUs[i])) == 0)
 256             return 1;
 257     }
 258     return 0;
 259 }
 260
 261 /*! \brief Checks whether the GPU with the given device id is supported in Gromacs-OpenMM.
 262  *
 263  * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
 264  * \param[out] gpu_name Set to contain the name of the CUDA device, if NULL passed, no device name is set.
 265  * \returns             TRUE if the device is supported, otherwise FALSE
 266  *
 267  */
 268 gmx_bool is_gmx_openmm_supported_gpu(int dev_id, char *gpu_name)
 269 {
 270     cudaDeviceProp dev_prop;
 271
 272     if (debug) fprintf(debug, "Checking compatibility with device #%d, %s\n", dev_id, gpu_name);
 273
 274     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 275         return -1;
 276
 277     if (gpu_name != NULL)
 278     {
 279         strcpy(gpu_name, dev_prop.name);
 280     }
 281     return is_gmx_openmm_supported_gpu_name(dev_prop.name);
 282 }
 283
 284
 285 /*!
 286  * \brief Runs a set of memory tests specified by the given bit-flags.
 287  * Tries to allocate and do the test on \p megs Mb memory or
 288  * the greatest amount that can be allocated (>10Mb).
 289  * In case if an error is detected it stops without finishing the remaining
 290  * steps/iterations and returns greater then zero value.
 291  * In case of other errors (e.g. kernel launch errors, device querying errors)
 292  * -1 is returned.
 293  *
 294  * \param[in] which_tests   variable with bit-flags of the requested tests
 295  * \param[in] megs          amount of memory that will be tested in MB
 296  * \param[in] iter          number of iterations
 297  * \returns                 0 if no error was detected, otherwise >0
 298  */
 299 static int do_memtest(unsigned int which_tests, int megs, int iter)
 300 {
 301     memtestState    tester;
 302     int             i;
 303     uint            err_count; //, err_iter;
 304
 305     // no parameter check as this fn won't be called externally
 306
 307     // let's try to allocate the mem
 308     while (!tester.allocate(megs) && (megs - 10 > 0))
 309         { megs -= 10; tester.deallocate(); }
 310
 311     if (megs <= 10)
 312     {
 313         fprintf(stderr, "Unable to allocate GPU memory!\n");
 314         return -1;
 315     }
 316
 317     // clear the first 18 bits
 318     which_tests &= 0x3FFF;
 319     for (i = 0; i < iter; i++)
 320     {
 321         // Moving Inversions (ones and zeros)
 322         if ((MOVING_INVERSIONS_10 & which_tests) == MOVING_INVERSIONS_10)
 323         {
 324             tester.gpuMovingInversionsOnesZeros(err_count);
 325             if (err_count > 0)
 326                 return MOVING_INVERSIONS_10;
 327         }
 328         // Moving Inversions (random)
 329         if ((MOVING_INVERSIONS_RAND & which_tests) == MOVING_INVERSIONS_RAND)
 330         {
 331             tester.gpuMovingInversionsRandom(err_count);
 332             if (err_count > 0)
 333                 return MOVING_INVERSIONS_RAND;
 334         }
 335        // Memtest86 Walking 8-bit
 336         if ((WALKING_8BIT_M86 & which_tests) == WALKING_8BIT_M86)
 337         {
 338             for (uint shift = 0; shift < 8; shift++)
 339             {
 340                 tester.gpuWalking8BitM86(err_count, shift);
 341                 if (err_count > 0)
 342                     return WALKING_8BIT_M86;
 343             }
 344       }
 345         // True Walking zeros (8-bit)
 346         if ((WALKING_0_8BIT & which_tests) == WALKING_0_8BIT)
 347         {
 348             for (uint shift = 0; shift < 8; shift++)
 349             {
 350                 tester.gpuWalking8Bit(err_count, false, shift);
 351                 if (err_count > 0)
 352                     return WALKING_0_8BIT;
 353             }
 354         }
 355         // True Walking ones (8-bit)
 356         if ((WALKING_1_8BIT & which_tests) == WALKING_1_8BIT)
 357         {
 358             for (uint shift = 0; shift < 8; shift++)
 359             {
 360                 tester.gpuWalking8Bit(err_count, true, shift);
 361                 if (err_count > 0)
 362                     return WALKING_1_8BIT;
 363             }
 364         }
 365         // Memtest86 Walking zeros (32-bit)
 366         if ((WALKING_0_32BIT & which_tests) == WALKING_0_32BIT)
 367         {
 368             for (uint shift = 0; shift < 32; shift++)
 369             {
 370                 tester.gpuWalking32Bit(err_count, false, shift);
 371                 if (err_count > 0)
 372                     return WALKING_0_32BIT;
 373             }
 374         }
 375        // Memtest86 Walking ones (32-bit)
 376         if ((WALKING_1_32BIT & which_tests) == WALKING_1_32BIT)
 377         {
 378             for (uint shift = 0; shift < 32; shift++)
 379             {
 380                 tester.gpuWalking32Bit(err_count, true, shift);
 381                 if (err_count > 0)
 382                     return WALKING_1_32BIT;
 383             }
 384        }
 385         // Random blocks
 386         if ((RANDOM_BLOCKS & which_tests) == RANDOM_BLOCKS)
 387         {
 388             tester.gpuRandomBlocks(err_count,rand());
 389             if (err_count > 0)
 390                 return RANDOM_BLOCKS;
 391
 392         }
 393
 394         // Memtest86 Modulo-20
 395         if ((MOD_20_32BIT & which_tests) == MOD_20_32BIT)
 396         {
 397             for (uint shift = 0; shift < 20; shift++)
 398             {
 399                 tester.gpuModuloX(err_count, shift, rand(), 20, 2);
 400                 if (err_count > 0)
 401                     return MOD_20_32BIT;
 402             }
 403         }
 404         // Logic (one iteration)
 405         if ((LOGIC_1_ITER & which_tests) == LOGIC_1_ITER)
 406         {
 407             tester.gpuShortLCG0(err_count,1);
 408             if (err_count > 0)
 409                 return LOGIC_1_ITER;
 410         }
 411         // Logic (4 iterations)
 412         if ((LOGIC_4_ITER & which_tests) == LOGIC_4_ITER)
 413         {
 414             tester.gpuShortLCG0(err_count,4);
 415             if (err_count > 0)
 416                 return LOGIC_4_ITER;
 417
 418         }
 419         // Logic (shared memory, one iteration)
 420         if ((LOGIC_1_ITER_SHMEM & which_tests) == LOGIC_1_ITER_SHMEM)
 421         {
 422             tester.gpuShortLCG0Shmem(err_count,1);
 423             if (err_count > 0)
 424                 return LOGIC_1_ITER_SHMEM;
 425         }
 426         // Logic (shared-memory, 4 iterations)
 427         if ((LOGIC_4_ITER_SHMEM & which_tests) == LOGIC_4_ITER_SHMEM)
 428         {
 429             tester.gpuShortLCG0Shmem(err_count,4);
 430             if (err_count > 0)
 431                 return LOGIC_4_ITER_SHMEM;
 432         }
 433     }
 434
 435     tester.deallocate();
 436     return err_count;
 437 }
 438
 439 /*! \brief Runs a quick memory test and returns 0 in case if no error is detected.
 440  * If an error is detected it stops before completing the test and returns a
 441  * value greater then 0. In case of other errors (e.g. kernel launch errors,
 442  * device querying errors) -1 is returned.
 443  *
 444  * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
 445  * \returns             0 if no error was detected, otherwise >0
 446  */
 447 int do_quick_memtest(int dev_id)
 448 {
 449     cudaDeviceProp  dev_prop;
 450     int             devmem, res, time=0;
 451
 452     if (debug) { time = getTimeMilliseconds(); }
 453
 454     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 455     {
 456         // something went wrong
 457         return -1;
 458     }
 459
 460     if (debug)
 461     {
 462         devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 463         fprintf(debug, ">> Running QUICK memtests on %d MiB (out of total %d MiB), %d iterations\n",
 464             QUICK_MEM, devmem, QUICK_ITER);
 465     }
 466
 467     res = do_memtest(QUICK_TESTS, QUICK_MEM, QUICK_ITER);
 468
 469     if (debug)
 470     {
 471         fprintf(debug, "Q-RES = %d\n", res);
 472         fprintf(debug, "Q-runtime: %d ms\n", getTimeMilliseconds() - time);
 473     }
 474
 475     /* destroy context only if we created it */
 476     if (dev_id !=-1) cudaThreadExit();
 477     return res;
 478 }
 479
 480 /*! \brief Runs a full memory test and returns 0 in case if no error is detected.
 481  * If an error is detected  it stops before completing the test and returns a
 482  * value greater then 0. In case of other errors (e.g. kernel launch errors,
 483  * device querying errors) -1 is returned.
 484  *
 485  * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
 486  * \returns             0 if no error was detected, otherwise >0
 487  */
 488
 489 int do_full_memtest(int dev_id)
 490 {
 491     cudaDeviceProp  dev_prop;
 492     int             devmem, res, time=0;
 493
 494     if (debug) { time = getTimeMilliseconds(); }
 495
 496     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 497     {
 498         // something went wrong
 499         return -1;
 500     }
 501
 502     devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 503
 504     if (debug)
 505     {
 506         fprintf(debug, ">> Running FULL memtests on %d MiB (out of total %d MiB), %d iterations\n",
 507             devmem, devmem, FULL_ITER);
 508     }
 509
 510     /* do all test on the entire memory */
 511     res = do_memtest(FULL_TESTS, devmem, FULL_ITER);
 512
 513     if (debug)
 514     {
 515         fprintf(debug, "F-RES = %d\n", res);
 516         fprintf(debug, "F-runtime: %d ms\n", getTimeMilliseconds() - time);
 517     }
 518
 519     /* destroy context only if we created it */
 520     if (dev_id != -1) cudaThreadExit();
 521     return res;
 522 }
 523
 524 /*! \brief Runs a time constrained memory test and returns 0 in case if no error is detected.
 525  * If an error is detected it stops before completing the test and returns a value greater
 526  * than zero. In case of other errors (e.g. kernel launch errors, device querying errors) -1
 527  * is returned. Note, that test iterations are not interrupted therefor the total runtime of
 528  * the test will always be multipple of one iteration's runtime.
 529  *
 530  * \param[in] dev_id        the device id of the GPU or -1 if the device has laredy been selected
 531  * \param[in] time_constr   the time limit of the testing
 532  * \returns                 0 if no error was detected, otherwise >0
 533  */
 534 int do_timed_memtest(int dev_id, int time_constr)
 535 {
 536     cudaDeviceProp  dev_prop;
 537     int             devmem, res=0, time=0, startt;
 538
 539     if (debug) { time = getTimeMilliseconds(); }
 540
 541     time_constr *= 1000;  /* convert to ms for convenience */
 542     startt = getTimeMilliseconds();
 543
 544     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 545     {
 546         // something went wrong
 547         return -1;
 548     }
 549
 550     devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 551
 552     if (debug)
 553     {
 554         fprintf(debug, ">> Running time constrained memtests on %d MiB (out of total %d MiB), time limit of %d s \n",
 555         devmem, devmem, time_constr);
 556     }
 557
 558     /* do the TIMED_TESTS set, one step at a time on the entire memory
 559        that can be allocated, and stop when the given time is exceeded */
 560     while ( ((int)getTimeMilliseconds() - startt) < time_constr)
 561     {
 562         res = do_memtest(TIMED_TESTS, devmem, 1);
 563         if (res != 0) break;
 564     }
 565
 566     if (debug)
 567     {
 568         fprintf(debug, "T-RES = %d\n", res);
 569         fprintf(debug, "T-runtime: %d ms\n", getTimeMilliseconds() - time);
 570     }
 571
 572     /* destroy context only if we created it */
 573     if (dev_id != -1) cudaThreadExit();
 574     return res;
 575 }
 576
 577 /*! \brief Initializes the GPU with the given index.
 578  *
 579  * The varible \mygpu is the index of the GPU to initialize in the
 580  * gpu_info.cuda_dev array.
 581  *
 582  * \param[in]  mygpu        index of the GPU to initialize
 583  * \param[out] result_str   the message related to the error that occurred
 584  *                          during the initialization (if there was any).
 585  * \param[in] gpu_info      GPU info of all detected devices in the system.
 586  * \returns                 true if no error occurs during initialization.
 587  */
 588 gmx_bool init_gpu(int mygpu, char *result_str, const gmx_gpu_info_t *gpu_info)
 589 {
 590     cudaError_t stat;
 591     char sbuf[STRLEN];
 592     int gpuid;
 593
 594     assert(gpu_info);
 595     assert(result_str);
 596
 597     if (mygpu < 0 || mygpu >= gpu_info->ncuda_dev_use)
 598     {
 599         sprintf(sbuf, "Trying to initialize an inexistent GPU: "
 600                 "there are %d %s-selected GPU(s), but #%d was requested.",
 601                  gpu_info->ncuda_dev_use, gpu_info->bUserSet ? "user" : "auto", mygpu);
 602         gmx_incons(sbuf);
 603     }
 604
 605     gpuid = gpu_info->cuda_dev[gpu_info->cuda_dev_use[mygpu]].id;
 606
 607     stat = cudaSetDevice(gpuid);
 608     strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 609
 610     if (debug)
 611     {
 612         fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->cuda_dev[gpuid].prop.name);
 613     }
 614
 615     return (stat == cudaSuccess);
 616 }
 617
 618 /*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
 619  *
 620  * The context is explicitly destroyed and therefore all data uploaded to the GPU
 621  * is lost. This should only be called when none of this data is required anymore.
 622  *
 623  * \param[out] result_str   the message related to the error that occurred
 624  *                          during the initialization (if there was any).
 625  * \returns                 true if no error occurs during the freeing.
 626  */
 627 gmx_bool free_gpu(char *result_str)
 628 {
 629     cudaError_t stat;
 630
 631     assert(result_str);
 632
 633     if (debug)
 634     {
 635         int gpuid;
 636         stat = cudaGetDevice(&gpuid);
 637         CU_RET_ERR(stat, "cudaGetDevice failed");
 638         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 639     }
 640
 641 #if CUDA_VERSION < 4000
 642     stat = cudaThreadExit();
 643 #else
 644     stat = cudaDeviceReset();
 645 #endif
 646     strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 647
 648     return (stat == cudaSuccess);
 649 }
 650
 651 /*! \brief Returns true if the gpu characterized by the device properties is
 652  *  supported by the native gpu acceleration.
 653  *
 654  * \param[in] dev_prop  the CUDA device properties of the gpus to test.
 655  * \returns             true if the GPU properties passed indicate a compatible
 656  *                      GPU, otherwise false.
 657  */
 658 static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
 659 {
 660     return (dev_prop->major >= 2);
 661 }
 662
 663 /*! \brief Helper function that checks whether a given GPU status indicates compatible GPU.
 664  *
 665  * \param[in] stat  GPU status.
 666  * \returns         true if the provided status is egpuCompatible, otherwise false.
 667  */
 668 static bool is_compatible_gpu(int stat)
 669 {
 670     return (stat == egpuCompatible);
 671 }
 672
 673 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 674  *
 675  *  Returns a status value which indicates compatibility or one of the following
 676  *  errors: incompatibility, insistence, or insanity (=unexpected behavior).
 677  *  It also returns the respective device's properties in \dev_prop (if applicable).
 678  *
 679  *  \param[in]  dev_id   the ID of the GPU to check.
 680  *  \param[out] dev_prop the CUDA device properties of the device checked.
 681  *  \returns             the status of the requested device
 682  */
 683 static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
 684 {
 685     cudaError_t stat;
 686     int         ndev;
 687
 688     stat = cudaGetDeviceCount(&ndev);
 689     if (stat != cudaSuccess)
 690     {
 691         return egpuInsane;
 692     }
 693
 694     if (dev_id > ndev - 1)
 695     {
 696         return egpuNonexistent;
 697     }
 698
 699     /* TODO: currently we do not make a distinction between the type of errors
 700      * that can appear during sanity checks. This needs to be improved, e.g if
 701      * the dummy test kernel fails to execute with a "device busy message" we
 702      * should appropriately report that the device is busy instead of insane.
 703      */
 704     if (do_sanity_checks(dev_id, dev_prop) == 0)
 705     {
 706         if (is_gmx_supported_gpu(dev_prop))
 707         {
 708             return egpuCompatible;
 709         }
 710         else
 711         {
 712             return egpuIncompatible;
 713         }
 714     }
 715     else
 716     {
 717         return egpuInsane;
 718     }
 719 }
 720
 721
 722 /*! \brief Detect all NVIDIA GPUs in the system.
 723  *
 724  *  Will detect every NVIDIA GPU supported by the device driver in use. Also
 725  *  check for the compatibility of each and fill the gpu_info->cuda_dev array
 726  *  with the required information on each the device: ID, device properties,
 727  *  status.
 728  *
 729  *  \param[in] gpu_info    pointer to structure holding GPU information.
 730  *  \param[out] err_str    The error message of any CUDA API error that caused
 731  *                         the detection to fail (if there was any). The memory
 732  *                         the pointer points to should be managed externally.
 733  *  \returns               non-zero if the detection encountered a failure, zero otherwise.
 734  */
 735 int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
 736 {
 737     int             i, ndev, checkres, retval;
 738     cudaError_t     stat;
 739     cudaDeviceProp  prop;
 740     cuda_dev_info_t *devs;
 741
 742     assert(gpu_info);
 743     assert(err_str);
 744
 745     ndev    = 0;
 746     devs    = NULL;
 747
 748     stat = cudaGetDeviceCount(&ndev);
 749     if (stat != cudaSuccess)
 750     {
 751         const char *s;
 752
 753         /* cudaGetDeviceCount failed which means that there is something
 754          * wrong with the machine: driver-runtime mismatch, all GPUs being
 755          * busy in exclusive mode, or some other condition which should
 756          * result in us issuing a warning a falling back to CPUs. */
 757         retval = -1;
 758         s = cudaGetErrorString(stat);
 759         strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
 760     }
 761     else
 762     {
 763         snew(devs, ndev);
 764         for (i = 0; i < ndev; i++)
 765         {
 766             checkres = is_gmx_supported_gpu_id(i, &prop);
 767
 768             devs[i].id   = i;
 769             devs[i].prop = prop;
 770             devs[i].stat = checkres;
 771         }
 772         retval = 0;
 773     }
 774
 775     gpu_info->ncuda_dev = ndev;
 776     gpu_info->cuda_dev  = devs;
 777
 778     return retval;
 779 }
 780
 781 /*! \brief Select the GPUs compatible with the native GROMACS acceleration.
 782  *
 783  * This function selects the compatible gpus and initializes
 784  * gpu_info->cuda_dev_use and gpu_info->ncuda_dev_use.
 785  *
 786  * Given the list of GPUs available in the system the it checks each gpu in
 787  * gpu_info->cuda_dev and puts the the indices (into gpu_info->cuda_dev) of
 788  * the compatible ones into cuda_dev_use with this marking the respective
 789  * GPUs as "available for use."
 790  * Note that \detect_cuda_gpus must have been called before.
 791  *
 792  * \param[in]    gpu_info    pointer to structure holding GPU information
 793  */
 794 void pick_compatible_gpus(gmx_gpu_info_t *gpu_info)
 795 {
 796     int i, ncompat;
 797     int *compat;
 798
 799     assert(gpu_info);
 800     /* cuda_dev/ncuda_dev have to be either NULL/0 or not (NULL/0) */
 801     assert((gpu_info->ncuda_dev != 0 ? 0 : 1) ^ (gpu_info->cuda_dev == NULL ? 0 : 1));
 802
 803     snew(compat, gpu_info->ncuda_dev);
 804     ncompat = 0;
 805     for (i = 0; i < gpu_info->ncuda_dev; i++)
 806     {
 807         if (is_compatible_gpu(gpu_info->cuda_dev[i].stat))
 808         {
 809             ncompat++;
 810             compat[ncompat - 1] = i;
 811         }
 812     }
 813
 814     gpu_info->ncuda_dev_use = ncompat;
 815     snew(gpu_info->cuda_dev_use, ncompat);
 816     memcpy(gpu_info->cuda_dev_use, compat, ncompat*sizeof(*compat));
 817     sfree(compat);
 818 }
 819
 820 /*! \brief Check the existence/compatibility of a set of GPUs specified by their device IDs.
 821  *
 822  * Given the a list of GPU devide IDs in \requested_devs, check for the
 823  * existence and compatibility of the respective GPUs and fill in \gpu_info
 824  * with the collected information. Also provide the caller with an array with
 825  * the result of checks in \checkres.
 826  *
 827  * \param[out]  checkres    check result for each ID passed in \requested_devs
 828  * \param[in]   gpu_info    pointer to structure holding GPU information
 829  * \param[in]   requested_devs array of requested device IDs
 830  * \param[in]   count       number of IDs in \requested_devs
 831  * \returns                 TRUE if every requested GPU is compatible
 832  */
 833 gmx_bool check_select_cuda_gpus(int *checkres, gmx_gpu_info_t *gpu_info,
 834                                 const int *requested_devs, int count)
 835 {
 836     int i, id;
 837     bool bAllOk;
 838
 839     assert(checkres);
 840     assert(gpu_info);
 841     assert(requested_devs);
 842     assert(count >= 0);
 843
 844     if (count == 0)
 845     {
 846         return TRUE;
 847     }
 848
 849     /* we will assume that all GPUs requested are valid IDs,
 850        otherwise we'll bail anyways */
 851     gpu_info->ncuda_dev_use = count;
 852     snew(gpu_info->cuda_dev_use, count);
 853
 854     bAllOk = true;
 855     for (i = 0; i < count; i++)
 856     {
 857         id = requested_devs[i];
 858
 859         /* devices are stored in increasing order of IDs in cuda_dev */
 860         gpu_info->cuda_dev_use[i] = id;
 861
 862         checkres[i] = (id >= gpu_info->ncuda_dev) ?
 863             egpuNonexistent : gpu_info->cuda_dev[id].stat;
 864
 865         bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
 866     }
 867
 868     return bAllOk;
 869 }
 870
 871 /*! \brief Frees the cuda_dev and cuda_dev_use array fields of \gpu_info.
 872  *
 873  * \param[in]    gpu_info    pointer to structure holding GPU information
 874  */
 875 void free_gpu_info(const gmx_gpu_info_t *gpu_info)
 876 {
 877     if (gpu_info == NULL)
 878     {
 879         return;
 880     }
 881
 882     sfree(gpu_info->cuda_dev_use);
 883     sfree(gpu_info->cuda_dev);
 884 }
 885
 886 /*! \brief Formats and returns a device information string for a given GPU.
 887  *
 888  * Given an index *directly* into the array of available GPUs (cuda_dev)
 889  * returns a formatted info string for the respective GPU which includes
 890  * ID, name, compute capability, and detection status.
 891  *
 892  * \param[out]  s           pointer to output string (has to be allocated externally)
 893  * \param[in]   gpu_info    pointer to structure holding GPU information
 894  * \param[in]   index       an index *directly* into the array of available GPUs
 895  */
 896 void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index)
 897 {
 898     assert(s);
 899     assert(gpu_info);
 900
 901     if (index < 0 && index >= gpu_info->ncuda_dev)
 902     {
 903         return;
 904     }
 905
 906     cuda_dev_info_t *dinfo = &gpu_info->cuda_dev[index];
 907
 908     bool bGpuExists =
 909         dinfo->stat == egpuCompatible ||
 910         dinfo->stat == egpuIncompatible;
 911
 912     if (!bGpuExists)
 913     {
 914         sprintf(s, "#%d: %s, stat: %s",
 915                 dinfo->id, "N/A",
 916                 gpu_detect_res_str[dinfo->stat]);
 917     }
 918     else
 919     {
 920         sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
 921                 dinfo->id, dinfo->prop.name,
 922                 dinfo->prop.major, dinfo->prop.minor,
 923                 dinfo->prop.ECCEnabled ? "yes" : " no",
 924                 gpu_detect_res_str[dinfo->stat]);
 925     }
 926 }
 927
 928 /*! \brief Returns the device ID of the GPU with a given index into the array of used GPUs.
 929  *
 930  * Getter function which, given an index into the array of GPUs in use
 931  * (cuda_dev_use) -- typically a tMPI/MPI rank --, returns the device ID of the
 932  * respective CUDA GPU.
 933  *
 934  * \param[in]    gpu_info   pointer to structure holding GPU information
 935  * \param[in]    idx        index into the array of used GPUs
 936  * \returns                 device ID of the requested GPU
 937  */
 938 int get_gpu_device_id(const gmx_gpu_info_t *gpu_info, int idx)
 939 {
 940     assert(gpu_info);
 941     if (idx < 0 && idx >= gpu_info->ncuda_dev_use)
 942     {
 943         return -1;
 944     }
 945
 946     return gpu_info->cuda_dev[gpu_info->cuda_dev_use[idx]].id;
 947 }
 948
 949 /*! \brief Returns the device ID of the GPU currently in use.
 950  *
 951  * The GPU used is the one that is active at the time of the call in the active context.
 952  *
 953  * \param[in]    gpu_info   pointer to structure holding GPU information
 954  * \returns                 device ID of the GPU in use at the time of the call
 955  */
 956 int get_current_gpu_device_id(void)
 957 {
 958     int gpuid;
 959     CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 960
 961     return gpuid;
 962 }