src/gmxlib/gpu_utils/gpu_utils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2010, The GROMACS development team,
   6  * check out http://www.gromacs.org for more information.
   7  * Copyright (c) 2012, by the GROMACS development team, led by
   8  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   9  * others, as listed in the AUTHORS file in the top-level source
  10  * directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38
  39 #include <stdio.h>
  40 #include <stdlib.h>
  41 #include <assert.h>
  42
  43 #include "smalloc.h"
  44 #include "string2.h"
  45 #include "types/hw_info.h"
  46
  47 #include "gpu_utils.h"
  48 #include "../cuda_tools/cudautils.cuh"
  49 #include "memtestG80_core.h"
  50
  51
  52 #define QUICK_MEM       250 /*!< Amount of memory to be used in quick memtest. */
  53 #define QUICK_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS /*!< Bit flag with type of tests
  54                                                                             to run in quick memtest. */
  55 #define QUICK_ITER      3 /*!< Number of iterations in quick memtest. */
  56
  57 #define FULL_TESTS      0x3FFF /*!<  Bitflag with all test set on for full memetest. */
  58 #define FULL_ITER       25 /*!< Number of iterations in full memtest. */
  59
  60 #define TIMED_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS /*!< Bit flag with type of tests to
  61                                                                             run in time constrained memtest. */
  62
  63 /*! Number of supported GPUs */
  64 #define NB_GPUS (sizeof(SupportedGPUs)/sizeof(SupportedGPUs[0]))
  65
  66 static int cuda_max_device_count = 32; /*! Max number of devices supported by CUDA (for consistency checking).
  67                                            In reality it 16 with CUDA <=v5.0, but let's stay on the safe side. */
  68
  69 /*! Dummy kernel used for sanity checking. */
  70 __global__ void k_dummy_test(){}
  71
  72
  73 /*! Bit-flags which refer to memtestG80 test types and are used in do_memtest to specify which tests to run. */
  74 enum memtest_G80_test_types {
  75     MOVING_INVERSIONS_10 =      0x1,
  76     MOVING_INVERSIONS_RAND =    0x2,
  77     WALKING_8BIT_M86 =          0x4,
  78     WALKING_0_8BIT =            0x8,
  79     WALKING_1_8BIT =            0x10,
  80     WALKING_0_32BIT =           0x20,
  81     WALKING_1_32BIT =           0x40,
  82     RANDOM_BLOCKS =             0x80,
  83     MOD_20_32BIT =              0x100,
  84     LOGIC_1_ITER =              0x200,
  85     LOGIC_4_ITER =              0x400,
  86     LOGIC_1_ITER_SHMEM =        0x800,
  87     LOGIC_4_ITER_SHMEM =        0x1000
  88 };
  89
  90 // TODO put this list into an external file and include it so that the list is easily accessible
  91 /*! List of supported GPUs. */
  92 static const char * const SupportedGPUs[] = {
  93     /* GT400 */
  94     "Geforce GTX 480",
  95     "Geforce GTX 470",
  96     "Geforce GTX 465",
  97     "Geforce GTX 460",
  98
  99     "Tesla C2070",
 100     "Tesla C2050",
 101     "Tesla S2070",
 102     "Tesla S2050",
 103     "Tesla M2070",
 104     "Tesla M2050",
 105
 106     "Quadro 5000",
 107     "Quadro 6000",
 108
 109     /* GT200 */
 110     "Geforce GTX 295",
 111     "Geforce GTX 285",
 112     "Geforce GTX 280",
 113     "Geforce GTX 275",
 114     "Geforce GTX 260",
 115     "GeForce GTS 250",
 116     "GeForce GTS 150",
 117
 118     "GeForce GTX 285M",
 119     "GeForce GTX 280M",
 120
 121     "Tesla S1070",
 122     "Tesla C1060",
 123     "Tesla M1060",
 124
 125     "Quadro FX 5800",
 126     "Quadro FX 4800",
 127     "Quadro CX",
 128     "Quadro Plex 2200 D2",
 129     "Quadro Plex 2200 S4",
 130
 131     /* G90 */
 132     "GeForce 9800 G", /* GX2, GTX, GTX+, GT */
 133     "GeForce 9800M GTX",
 134
 135     "Quadro FX 4700",
 136     "Quadro Plex 2100 D4"
 137 };
 138
 139
 140 /*!
 141   * \brief Runs GPU sanity checks.
 142   *
 143   * Runs a series of checks to determine that the given GPU and underlying CUDA
 144   * driver/runtime functions properly.
 145   * Returns properties of a device with given ID or the one that has
 146   * already been initialized earlier in the case if of \dev_id == -1.
 147   *
 148   * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
 149   * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
 150   * \returns                0 if the device looks OK
 151   *
 152   * TODO: introduce errors codes and handle errors more smoothly.
 153   */
 154 static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
 155 {
 156     cudaError_t cu_err;
 157     int         dev_count, id;
 158
 159     cu_err = cudaGetDeviceCount(&dev_count);
 160     if (cu_err != cudaSuccess)
 161     {
 162        fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
 163                cudaGetErrorString(cu_err));
 164         return -1;
 165     }
 166
 167     /* no CUDA compatible device at all */
 168     if (dev_count == 0)
 169         return -1;
 170
 171     /* things might go horribly wrong if cudart is not compatible with the driver */
 172     if (dev_count < 0 || dev_count > cuda_max_device_count)
 173         return -1;
 174
 175     if (dev_id == -1) /* device already selected let's not destroy the context */
 176     {
 177         cu_err = cudaGetDevice(&id);
 178         if (cu_err != cudaSuccess)
 179         {
 180             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
 181                     cudaGetErrorString(cu_err));
 182             return -1;
 183         }
 184     }
 185     else
 186     {
 187         id = dev_id;
 188         if (id > dev_count - 1) /* pfff there's no such device */
 189         {
 190             fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
 191                     dev_id, dev_count);
 192             return -1;
 193         }
 194     }
 195
 196     memset(dev_prop, 0, sizeof(cudaDeviceProp));
 197     cu_err = cudaGetDeviceProperties(dev_prop, id);
 198     if (cu_err != cudaSuccess)
 199     {
 200         fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
 201                 cudaGetErrorString(cu_err));
 202         return -1;
 203     }
 204
 205     /* both major & minor is 9999 if no CUDA capable devices are present */
 206     if (dev_prop->major == 9999 && dev_prop->minor == 9999)
 207         return -1;
 208     /* we don't care about emulation mode */
 209     if (dev_prop->major == 0)
 210         return -1;
 211
 212     if (id != -1)
 213     {
 214         cu_err = cudaSetDevice(id);
 215         if (cu_err != cudaSuccess)
 216         {
 217             fprintf(stderr, "Error %d while switching to device #%d: %s\n",
 218                     cu_err, id, cudaGetErrorString(cu_err));
 219             return -1;
 220         }
 221     }
 222
 223     /* try to execute a dummy kernel */
 224     k_dummy_test<<<1, 512>>>();
 225     if (cudaThreadSynchronize() != cudaSuccess)
 226     {
 227         return -1;
 228     }
 229
 230     /* destroy context if we created one */
 231     if (id != -1)
 232     {
 233 #if CUDA_VERSION < 4000
 234         cu_err = cudaThreadExit();
 235         CU_RET_ERR(cu_err, "cudaThreadExit failed");
 236 #else
 237         cu_err = cudaDeviceReset();
 238         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 239 #endif
 240     }
 241
 242     return 0;
 243 }
 244
 245
 246 /*!
 247  * \brief Checks whether the GPU with the given name is supported in Gromacs-OpenMM.
 248  *
 249  * \param[in] gpu_name  the name of the CUDA device
 250  * \returns             TRUE if the device is supported, otherwise FALSE
 251  */
 252 static bool is_gmx_openmm_supported_gpu_name(char *gpuName)
 253 {
 254     size_t i;
 255     for (i = 0; i < NB_GPUS; i++)
 256     {
 257         trim(gpuName);
 258         if (gmx_strncasecmp(gpuName, SupportedGPUs[i], strlen(SupportedGPUs[i])) == 0)
 259             return 1;
 260     }
 261     return 0;
 262 }
 263
 264 /*! \brief Checks whether the GPU with the given device id is supported in Gromacs-OpenMM.
 265  *
 266  * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
 267  * \param[out] gpu_name Set to contain the name of the CUDA device, if NULL passed, no device name is set.
 268  * \returns             TRUE if the device is supported, otherwise FALSE
 269  *
 270  */
 271 gmx_bool is_gmx_openmm_supported_gpu(int dev_id, char *gpu_name)
 272 {
 273     cudaDeviceProp dev_prop;
 274
 275     if (debug) fprintf(debug, "Checking compatibility with device #%d, %s\n", dev_id, gpu_name);
 276
 277     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 278         return -1;
 279
 280     if (gpu_name != NULL)
 281     {
 282         strcpy(gpu_name, dev_prop.name);
 283     }
 284     return is_gmx_openmm_supported_gpu_name(dev_prop.name);
 285 }
 286
 287
 288 /*!
 289  * \brief Runs a set of memory tests specified by the given bit-flags.
 290  * Tries to allocate and do the test on \p megs Mb memory or
 291  * the greatest amount that can be allocated (>10Mb).
 292  * In case if an error is detected it stops without finishing the remaining
 293  * steps/iterations and returns greater then zero value.
 294  * In case of other errors (e.g. kernel launch errors, device querying errors)
 295  * -1 is returned.
 296  *
 297  * \param[in] which_tests   variable with bit-flags of the requested tests
 298  * \param[in] megs          amount of memory that will be tested in MB
 299  * \param[in] iter          number of iterations
 300  * \returns                 0 if no error was detected, otherwise >0
 301  */
 302 static int do_memtest(unsigned int which_tests, int megs, int iter)
 303 {
 304     memtestState    tester;
 305     int             i;
 306     uint            err_count; //, err_iter;
 307
 308     // no parameter check as this fn won't be called externally
 309
 310     // let's try to allocate the mem
 311     while (!tester.allocate(megs) && (megs - 10 > 0))
 312         { megs -= 10; tester.deallocate(); }
 313
 314     if (megs <= 10)
 315     {
 316         fprintf(stderr, "Unable to allocate GPU memory!\n");
 317         return -1;
 318     }
 319
 320     // clear the first 18 bits
 321     which_tests &= 0x3FFF;
 322     for (i = 0; i < iter; i++)
 323     {
 324         // Moving Inversions (ones and zeros)
 325         if ((MOVING_INVERSIONS_10 & which_tests) == MOVING_INVERSIONS_10)
 326         {
 327             tester.gpuMovingInversionsOnesZeros(err_count);
 328             if (err_count > 0)
 329                 return MOVING_INVERSIONS_10;
 330         }
 331         // Moving Inversions (random)
 332         if ((MOVING_INVERSIONS_RAND & which_tests) == MOVING_INVERSIONS_RAND)
 333         {
 334             tester.gpuMovingInversionsRandom(err_count);
 335             if (err_count > 0)
 336                 return MOVING_INVERSIONS_RAND;
 337         }
 338        // Memtest86 Walking 8-bit
 339         if ((WALKING_8BIT_M86 & which_tests) == WALKING_8BIT_M86)
 340         {
 341             for (uint shift = 0; shift < 8; shift++)
 342             {
 343                 tester.gpuWalking8BitM86(err_count, shift);
 344                 if (err_count > 0)
 345                     return WALKING_8BIT_M86;
 346             }
 347       }
 348         // True Walking zeros (8-bit)
 349         if ((WALKING_0_8BIT & which_tests) == WALKING_0_8BIT)
 350         {
 351             for (uint shift = 0; shift < 8; shift++)
 352             {
 353                 tester.gpuWalking8Bit(err_count, false, shift);
 354                 if (err_count > 0)
 355                     return WALKING_0_8BIT;
 356             }
 357         }
 358         // True Walking ones (8-bit)
 359         if ((WALKING_1_8BIT & which_tests) == WALKING_1_8BIT)
 360         {
 361             for (uint shift = 0; shift < 8; shift++)
 362             {
 363                 tester.gpuWalking8Bit(err_count, true, shift);
 364                 if (err_count > 0)
 365                     return WALKING_1_8BIT;
 366             }
 367         }
 368         // Memtest86 Walking zeros (32-bit)
 369         if ((WALKING_0_32BIT & which_tests) == WALKING_0_32BIT)
 370         {
 371             for (uint shift = 0; shift < 32; shift++)
 372             {
 373                 tester.gpuWalking32Bit(err_count, false, shift);
 374                 if (err_count > 0)
 375                     return WALKING_0_32BIT;
 376             }
 377         }
 378        // Memtest86 Walking ones (32-bit)
 379         if ((WALKING_1_32BIT & which_tests) == WALKING_1_32BIT)
 380         {
 381             for (uint shift = 0; shift < 32; shift++)
 382             {
 383                 tester.gpuWalking32Bit(err_count, true, shift);
 384                 if (err_count > 0)
 385                     return WALKING_1_32BIT;
 386             }
 387        }
 388         // Random blocks
 389         if ((RANDOM_BLOCKS & which_tests) == RANDOM_BLOCKS)
 390         {
 391             tester.gpuRandomBlocks(err_count,rand());
 392             if (err_count > 0)
 393                 return RANDOM_BLOCKS;
 394
 395         }
 396
 397         // Memtest86 Modulo-20
 398         if ((MOD_20_32BIT & which_tests) == MOD_20_32BIT)
 399         {
 400             for (uint shift = 0; shift < 20; shift++)
 401             {
 402                 tester.gpuModuloX(err_count, shift, rand(), 20, 2);
 403                 if (err_count > 0)
 404                     return MOD_20_32BIT;
 405             }
 406         }
 407         // Logic (one iteration)
 408         if ((LOGIC_1_ITER & which_tests) == LOGIC_1_ITER)
 409         {
 410             tester.gpuShortLCG0(err_count,1);
 411             if (err_count > 0)
 412                 return LOGIC_1_ITER;
 413         }
 414         // Logic (4 iterations)
 415         if ((LOGIC_4_ITER & which_tests) == LOGIC_4_ITER)
 416         {
 417             tester.gpuShortLCG0(err_count,4);
 418             if (err_count > 0)
 419                 return LOGIC_4_ITER;
 420
 421         }
 422         // Logic (shared memory, one iteration)
 423         if ((LOGIC_1_ITER_SHMEM & which_tests) == LOGIC_1_ITER_SHMEM)
 424         {
 425             tester.gpuShortLCG0Shmem(err_count,1);
 426             if (err_count > 0)
 427                 return LOGIC_1_ITER_SHMEM;
 428         }
 429         // Logic (shared-memory, 4 iterations)
 430         if ((LOGIC_4_ITER_SHMEM & which_tests) == LOGIC_4_ITER_SHMEM)
 431         {
 432             tester.gpuShortLCG0Shmem(err_count,4);
 433             if (err_count > 0)
 434                 return LOGIC_4_ITER_SHMEM;
 435         }
 436     }
 437
 438     tester.deallocate();
 439     return err_count;
 440 }
 441
 442 /*! \brief Runs a quick memory test and returns 0 in case if no error is detected.
 443  * If an error is detected it stops before completing the test and returns a
 444  * value greater then 0. In case of other errors (e.g. kernel launch errors,
 445  * device querying errors) -1 is returned.
 446  *
 447  * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
 448  * \returns             0 if no error was detected, otherwise >0
 449  */
 450 int do_quick_memtest(int dev_id)
 451 {
 452     cudaDeviceProp  dev_prop;
 453     int             devmem, res, time=0;
 454
 455     if (debug) { time = getTimeMilliseconds(); }
 456
 457     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 458     {
 459         // something went wrong
 460         return -1;
 461     }
 462
 463     if (debug)
 464     {
 465         devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 466         fprintf(debug, ">> Running QUICK memtests on %d MiB (out of total %d MiB), %d iterations\n",
 467             QUICK_MEM, devmem, QUICK_ITER);
 468     }
 469
 470     res = do_memtest(QUICK_TESTS, QUICK_MEM, QUICK_ITER);
 471
 472     if (debug)
 473     {
 474         fprintf(debug, "Q-RES = %d\n", res);
 475         fprintf(debug, "Q-runtime: %d ms\n", getTimeMilliseconds() - time);
 476     }
 477
 478     /* destroy context only if we created it */
 479     if (dev_id !=-1) cudaThreadExit();
 480     return res;
 481 }
 482
 483 /*! \brief Runs a full memory test and returns 0 in case if no error is detected.
 484  * If an error is detected  it stops before completing the test and returns a
 485  * value greater then 0. In case of other errors (e.g. kernel launch errors,
 486  * device querying errors) -1 is returned.
 487  *
 488  * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
 489  * \returns             0 if no error was detected, otherwise >0
 490  */
 491
 492 int do_full_memtest(int dev_id)
 493 {
 494     cudaDeviceProp  dev_prop;
 495     int             devmem, res, time=0;
 496
 497     if (debug) { time = getTimeMilliseconds(); }
 498
 499     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 500     {
 501         // something went wrong
 502         return -1;
 503     }
 504
 505     devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 506
 507     if (debug)
 508     {
 509         fprintf(debug, ">> Running FULL memtests on %d MiB (out of total %d MiB), %d iterations\n",
 510             devmem, devmem, FULL_ITER);
 511     }
 512
 513     /* do all test on the entire memory */
 514     res = do_memtest(FULL_TESTS, devmem, FULL_ITER);
 515
 516     if (debug)
 517     {
 518         fprintf(debug, "F-RES = %d\n", res);
 519         fprintf(debug, "F-runtime: %d ms\n", getTimeMilliseconds() - time);
 520     }
 521
 522     /* destroy context only if we created it */
 523     if (dev_id != -1) cudaThreadExit();
 524     return res;
 525 }
 526
 527 /*! \brief Runs a time constrained memory test and returns 0 in case if no error is detected.
 528  * If an error is detected it stops before completing the test and returns a value greater
 529  * than zero. In case of other errors (e.g. kernel launch errors, device querying errors) -1
 530  * is returned. Note, that test iterations are not interrupted therefor the total runtime of
 531  * the test will always be multipple of one iteration's runtime.
 532  *
 533  * \param[in] dev_id        the device id of the GPU or -1 if the device has laredy been selected
 534  * \param[in] time_constr   the time limit of the testing
 535  * \returns                 0 if no error was detected, otherwise >0
 536  */
 537 int do_timed_memtest(int dev_id, int time_constr)
 538 {
 539     cudaDeviceProp  dev_prop;
 540     int             devmem, res=0, time=0, startt;
 541
 542     if (debug) { time = getTimeMilliseconds(); }
 543
 544     time_constr *= 1000;  /* convert to ms for convenience */
 545     startt = getTimeMilliseconds();
 546
 547     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 548     {
 549         // something went wrong
 550         return -1;
 551     }
 552
 553     devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 554
 555     if (debug)
 556     {
 557         fprintf(debug, ">> Running time constrained memtests on %d MiB (out of total %d MiB), time limit of %d s \n",
 558         devmem, devmem, time_constr);
 559     }
 560
 561     /* do the TIMED_TESTS set, one step at a time on the entire memory
 562        that can be allocated, and stop when the given time is exceeded */
 563     while ( ((int)getTimeMilliseconds() - startt) < time_constr)
 564     {
 565         res = do_memtest(TIMED_TESTS, devmem, 1);
 566         if (res != 0) break;
 567     }
 568
 569     if (debug)
 570     {
 571         fprintf(debug, "T-RES = %d\n", res);
 572         fprintf(debug, "T-runtime: %d ms\n", getTimeMilliseconds() - time);
 573     }
 574
 575     /* destroy context only if we created it */
 576     if (dev_id != -1) cudaThreadExit();
 577     return res;
 578 }
 579
 580 /*! \brief Initializes the GPU with the given index.
 581  *
 582  * The varible \mygpu is the index of the GPU to initialize in the
 583  * gpu_info.cuda_dev array.
 584  *
 585  * \param[in]  mygpu        index of the GPU to initialize
 586  * \param[out] result_str   the message related to the error that occurred
 587  *                          during the initialization (if there was any).
 588  * \param[in] gpu_info      GPU info of all detected devices in the system.
 589  * \returns                 true if no error occurs during initialization.
 590  */
 591 gmx_bool init_gpu(int mygpu, char *result_str, const gmx_gpu_info_t *gpu_info)
 592 {
 593     cudaError_t stat;
 594     char sbuf[STRLEN];
 595     int gpuid;
 596
 597     assert(gpu_info);
 598     assert(result_str);
 599
 600     if (mygpu < 0 || mygpu >= gpu_info->ncuda_dev_use)
 601     {
 602         sprintf(sbuf, "Trying to initialize an inexistent GPU: "
 603                 "there are %d %s-selected GPU(s), but #%d was requested.",
 604                  gpu_info->ncuda_dev_use, gpu_info->bUserSet ? "user" : "auto", mygpu);
 605         gmx_incons(sbuf);
 606     }
 607
 608     gpuid = gpu_info->cuda_dev[gpu_info->cuda_dev_use[mygpu]].id;
 609
 610     stat = cudaSetDevice(gpuid);
 611     strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 612
 613     if (debug)
 614     {
 615         fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->cuda_dev[gpuid].prop.name);
 616     }
 617
 618     return (stat == cudaSuccess);
 619 }
 620
 621 /*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
 622  *
 623  * The context is explicitly destroyed and therefore all data uploaded to the GPU
 624  * is lost. This should only be called when none of this data is required anymore.
 625  *
 626  * \param[out] result_str   the message related to the error that occurred
 627  *                          during the initialization (if there was any).
 628  * \returns                 true if no error occurs during the freeing.
 629  */
 630 gmx_bool free_gpu(char *result_str)
 631 {
 632     cudaError_t stat;
 633
 634     assert(result_str);
 635
 636     if (debug)
 637     {
 638         int gpuid;
 639         stat = cudaGetDevice(&gpuid);
 640         CU_RET_ERR(stat, "cudaGetDevice failed");
 641         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 642     }
 643
 644 #if CUDA_VERSION < 4000
 645     stat = cudaThreadExit();
 646 #else
 647     stat = cudaDeviceReset();
 648 #endif
 649     strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 650
 651     return (stat == cudaSuccess);
 652 }
 653
 654 /*! \brief Returns true if the gpu characterized by the device properties is
 655  *  supported by the native gpu acceleration.
 656  *
 657  * \param[in] dev_prop  the CUDA device properties of the gpus to test.
 658  * \returns             true if the GPU properties passed indicate a compatible
 659  *                      GPU, otherwise false.
 660  */
 661 static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
 662 {
 663     return (dev_prop->major >= 2);
 664 }
 665
 666 /*! \brief Helper function that checks whether a given GPU status indicates compatible GPU.
 667  *
 668  * \param[in] stat  GPU status.
 669  * \returns         true if the provided status is egpuCompatible, otherwise false.
 670  */
 671 static bool is_compatible_gpu(int stat)
 672 {
 673     return (stat == egpuCompatible);
 674 }
 675
 676 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 677  *
 678  *  Returns a status value which indicates compatibility or one of the following
 679  *  errors: incompatibility, insistence, or insanity (=unexpected behavior).
 680  *  It also returns the respective device's properties in \dev_prop (if applicable).
 681  *
 682  *  \param[in]  dev_id   the ID of the GPU to check.
 683  *  \param[out] dev_prop the CUDA device properties of the device checked.
 684  *  \returns             the status of the requested device
 685  */
 686 static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
 687 {
 688     cudaError_t stat;
 689     int         ndev;
 690
 691     stat = cudaGetDeviceCount(&ndev);
 692     if (stat != cudaSuccess)
 693     {
 694         return egpuInsane;
 695     }
 696
 697     if (dev_id > ndev - 1)
 698     {
 699         return egpuNonexistent;
 700     }
 701
 702     /* TODO: currently we do not make a distinction between the type of errors
 703      * that can appear during sanity checks. This needs to be improved, e.g if
 704      * the dummy test kernel fails to execute with a "device busy message" we
 705      * should appropriately report that the device is busy instead of insane.
 706      */
 707     if (do_sanity_checks(dev_id, dev_prop) == 0)
 708     {
 709         if (is_gmx_supported_gpu(dev_prop))
 710         {
 711             return egpuCompatible;
 712         }
 713         else
 714         {
 715             return egpuIncompatible;
 716         }
 717     }
 718     else
 719     {
 720         return egpuInsane;
 721     }
 722 }
 723
 724
 725 /*! \brief Detect all NVIDIA GPUs in the system.
 726  *
 727  *  Will detect every NVIDIA GPU supported by the device driver in use. Also
 728  *  check for the compatibility of each and fill the gpu_info->cuda_dev array
 729  *  with the required information on each the device: ID, device properties,
 730  *  status.
 731  *
 732  *  \param[in] gpu_info    pointer to structure holding GPU information.
 733  *  \param[out] err_str    The error message of any CUDA API error that caused
 734  *                         the detection to fail (if there was any). The memory
 735  *                         the pointer points to should be managed externally.
 736  *  \returns               non-zero if the detection encountered a failure, zero otherwise.
 737  */
 738 int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
 739 {
 740     int             i, ndev, checkres, retval;
 741     cudaError_t     stat;
 742     cudaDeviceProp  prop;
 743     cuda_dev_info_t *devs;
 744
 745     assert(gpu_info);
 746     assert(err_str);
 747
 748     ndev    = 0;
 749     devs    = NULL;
 750
 751     stat = cudaGetDeviceCount(&ndev);
 752     if (stat != cudaSuccess)
 753     {
 754         const char *s;
 755
 756         /* cudaGetDeviceCount failed which means that there is something
 757          * wrong with the machine: driver-runtime mismatch, all GPUs being
 758          * busy in exclusive mode, or some other condition which should
 759          * result in us issuing a warning a falling back to CPUs. */
 760         retval = -1;
 761         s = cudaGetErrorString(stat);
 762         strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
 763     }
 764     else
 765     {
 766         snew(devs, ndev);
 767         for (i = 0; i < ndev; i++)
 768         {
 769             checkres = is_gmx_supported_gpu_id(i, &prop);
 770
 771             devs[i].id   = i;
 772             devs[i].prop = prop;
 773             devs[i].stat = checkres;
 774         }
 775         retval = 0;
 776     }
 777
 778     gpu_info->ncuda_dev = ndev;
 779     gpu_info->cuda_dev  = devs;
 780
 781     return retval;
 782 }
 783
 784 /*! \brief Select the GPUs compatible with the native GROMACS acceleration.
 785  *
 786  * This function selects the compatible gpus and initializes
 787  * gpu_info->cuda_dev_use and gpu_info->ncuda_dev_use.
 788  *
 789  * Given the list of GPUs available in the system the it checks each gpu in
 790  * gpu_info->cuda_dev and puts the the indices (into gpu_info->cuda_dev) of
 791  * the compatible ones into cuda_dev_use with this marking the respective
 792  * GPUs as "available for use."
 793  * Note that \detect_cuda_gpus must have been called before.
 794  *
 795  * \param[in]    gpu_info    pointer to structure holding GPU information
 796  */
 797 void pick_compatible_gpus(gmx_gpu_info_t *gpu_info)
 798 {
 799     int i, ncompat;
 800     int *compat;
 801
 802     assert(gpu_info);
 803     /* cuda_dev/ncuda_dev have to be either NULL/0 or not (NULL/0) */
 804     assert((gpu_info->ncuda_dev != 0 ? 0 : 1) ^ (gpu_info->cuda_dev == NULL ? 0 : 1));
 805
 806     snew(compat, gpu_info->ncuda_dev);
 807     ncompat = 0;
 808     for (i = 0; i < gpu_info->ncuda_dev; i++)
 809     {
 810         if (is_compatible_gpu(gpu_info->cuda_dev[i].stat))
 811         {
 812             ncompat++;
 813             compat[ncompat - 1] = i;
 814         }
 815     }
 816
 817     gpu_info->ncuda_dev_use = ncompat;
 818     snew(gpu_info->cuda_dev_use, ncompat);
 819     memcpy(gpu_info->cuda_dev_use, compat, ncompat*sizeof(*compat));
 820     sfree(compat);
 821 }
 822
 823 /*! \brief Check the existence/compatibility of a set of GPUs specified by their device IDs.
 824  *
 825  * Given the a list of GPU devide IDs in \requested_devs, check for the
 826  * existence and compatibility of the respective GPUs and fill in \gpu_info
 827  * with the collected information. Also provide the caller with an array with
 828  * the result of checks in \checkres.
 829  *
 830  * \param[out]  checkres    check result for each ID passed in \requested_devs
 831  * \param[in]   gpu_info    pointer to structure holding GPU information
 832  * \param[in]   requested_devs array of requested device IDs
 833  * \param[in]   count       number of IDs in \requested_devs
 834  * \returns                 TRUE if every requested GPU is compatible
 835  */
 836 gmx_bool check_select_cuda_gpus(int *checkres, gmx_gpu_info_t *gpu_info,
 837                                 const int *requested_devs, int count)
 838 {
 839     int i, id;
 840     bool bAllOk;
 841
 842     assert(checkres);
 843     assert(gpu_info);
 844     assert(requested_devs);
 845     assert(count >= 0);
 846
 847     if (count == 0)
 848     {
 849         return TRUE;
 850     }
 851
 852     /* we will assume that all GPUs requested are valid IDs,
 853        otherwise we'll bail anyways */
 854     gpu_info->ncuda_dev_use = count;
 855     snew(gpu_info->cuda_dev_use, count);
 856
 857     bAllOk = true;
 858     for (i = 0; i < count; i++)
 859     {
 860         id = requested_devs[i];
 861
 862         /* devices are stored in increasing order of IDs in cuda_dev */
 863         gpu_info->cuda_dev_use[i] = id;
 864
 865         checkres[i] = (id >= gpu_info->ncuda_dev) ?
 866             egpuNonexistent : gpu_info->cuda_dev[id].stat;
 867
 868         bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
 869     }
 870
 871     return bAllOk;
 872 }
 873
 874 /*! \brief Frees the cuda_dev and cuda_dev_use array fields of \gpu_info.
 875  *
 876  * \param[in]    gpu_info    pointer to structure holding GPU information
 877  */
 878 void free_gpu_info(const gmx_gpu_info_t *gpu_info)
 879 {
 880     if (gpu_info == NULL)
 881     {
 882         return;
 883     }
 884
 885     sfree(gpu_info->cuda_dev_use);
 886     sfree(gpu_info->cuda_dev);
 887 }
 888
 889 /*! \brief Formats and returns a device information string for a given GPU.
 890  *
 891  * Given an index *directly* into the array of available GPUs (cuda_dev)
 892  * returns a formatted info string for the respective GPU which includes
 893  * ID, name, compute capability, and detection status.
 894  *
 895  * \param[out]  s           pointer to output string (has to be allocated externally)
 896  * \param[in]   gpu_info    pointer to structure holding GPU information
 897  * \param[in]   index       an index *directly* into the array of available GPUs
 898  */
 899 void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index)
 900 {
 901     assert(s);
 902     assert(gpu_info);
 903
 904     if (index < 0 && index >= gpu_info->ncuda_dev)
 905     {
 906         return;
 907     }
 908
 909     cuda_dev_info_t *dinfo = &gpu_info->cuda_dev[index];
 910
 911     bool bGpuExists =
 912         dinfo->stat == egpuCompatible ||
 913         dinfo->stat == egpuIncompatible;
 914
 915     if (!bGpuExists)
 916     {
 917         sprintf(s, "#%d: %s, stat: %s",
 918                 dinfo->id, "N/A",
 919                 gpu_detect_res_str[dinfo->stat]);
 920     }
 921     else
 922     {
 923         sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
 924                 dinfo->id, dinfo->prop.name,
 925                 dinfo->prop.major, dinfo->prop.minor,
 926                 dinfo->prop.ECCEnabled ? "yes" : " no",
 927                 gpu_detect_res_str[dinfo->stat]);
 928     }
 929 }
 930
 931 /*! \brief Returns the device ID of the GPU with a given index into the array of used GPUs.
 932  *
 933  * Getter function which, given an index into the array of GPUs in use
 934  * (cuda_dev_use) -- typically a tMPI/MPI rank --, returns the device ID of the
 935  * respective CUDA GPU.
 936  *
 937  * \param[in]    gpu_info   pointer to structure holding GPU information
 938  * \param[in]    idx        index into the array of used GPUs
 939  * \returns                 device ID of the requested GPU
 940  */
 941 int get_gpu_device_id(const gmx_gpu_info_t *gpu_info, int idx)
 942 {
 943     assert(gpu_info);
 944     if (idx < 0 && idx >= gpu_info->ncuda_dev_use)
 945     {
 946         return -1;
 947     }
 948
 949     return gpu_info->cuda_dev[gpu_info->cuda_dev_use[idx]].id;
 950 }
 951
 952 /*! \brief Returns the device ID of the GPU currently in use.
 953  *
 954  * The GPU used is the one that is active at the time of the call in the active context.
 955  *
 956  * \param[in]    gpu_info   pointer to structure holding GPU information
 957  * \returns                 device ID of the GPU in use at the time of the call
 958  */
 959 int get_current_gpu_device_id(void)
 960 {
 961     int gpuid;
 962     CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 963
 964     return gpuid;
 965 }