src/gromacs/gmxlib/gpu_utils/gpu_utils.cu

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  *                This source code is part of
   5  *
   6  *                 G   R   O   M   A   C   S
   7  *
   8  *          GROningen MAchine for Chemical Simulations
   9  *
  10  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  11  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  12  * Copyright (c) 2001-2010,2012 The GROMACS development team,
  13  * check out http://www.gromacs.org for more information.
  14
  15  * This program is free software; you can redistribute it and/or
  16  * modify it under the terms of the GNU General Public License
  17  * as published by the Free Software Foundation; either version 2
  18  * of the License, or (at your option) any later version.
  19  *
  20  * If you want to redistribute modifications, please consider that
  21  * scientific software is very special. Version control is crucial -
  22  * bugs must be traceable. We will be happy to consider code for
  23  * inclusion in the official distribution, but derived work must not
  24  * be called official GROMACS. Details are found in the README & COPYING
  25  * files - if they are missing, get the official version at www.gromacs.org.
  26  *
  27  * To help us fund GROMACS development, we humbly ask that you cite
  28  * the papers on the package - you can find them in the top README file.
  29  *
  30  * For more info, check our website at http://www.gromacs.org
  31  *
  32  * And Hey:
  33  * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
  34  */
  35
  36 #include <stdio.h>
  37 #include <stdlib.h>
  38 #include <assert.h>
  39
  40 #include "smalloc.h"
  41 #include "string2.h"
  42 #include "types/hw_info.h"
  43
  44 #include "gpu_utils.h"
  45 #include "../cuda_tools/cudautils.cuh"
  46 #include "memtestG80_core.h"
  47
  48
  49 #define QUICK_MEM       250 /*!< Amount of memory to be used in quick memtest. */
  50 #define QUICK_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS /*!< Bit flag with type of tests
  51                                                                             to run in quick memtest. */
  52 #define QUICK_ITER      3 /*!< Number of iterations in quick memtest. */
  53
  54 #define FULL_TESTS      0x3FFF /*!<  Bitflag with all test set on for full memetest. */
  55 #define FULL_ITER       25 /*!< Number of iterations in full memtest. */
  56
  57 #define TIMED_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS /*!< Bit flag with type of tests to
  58                                                                             run in time constrained memtest. */
  59
  60 static int cuda_max_device_count = 32; /*! Max number of devices supported by CUDA (for consistency checking).
  61                                            In reality it 16 with CUDA <=v5.0, but let's stay on the safe side. */
  62
  63 /*! Dummy kernel used for sanity checking. */
  64 __global__ void k_dummy_test(){}
  65
  66
  67 /*! Bit-flags which refer to memtestG80 test types and are used in do_memtest to specify which tests to run. */
  68 enum memtest_G80_test_types {
  69     MOVING_INVERSIONS_10 =      0x1,
  70     MOVING_INVERSIONS_RAND =    0x2,
  71     WALKING_8BIT_M86 =          0x4,
  72     WALKING_0_8BIT =            0x8,
  73     WALKING_1_8BIT =            0x10,
  74     WALKING_0_32BIT =           0x20,
  75     WALKING_1_32BIT =           0x40,
  76     RANDOM_BLOCKS =             0x80,
  77     MOD_20_32BIT =              0x100,
  78     LOGIC_1_ITER =              0x200,
  79     LOGIC_4_ITER =              0x400,
  80     LOGIC_1_ITER_SHMEM =        0x800,
  81     LOGIC_4_ITER_SHMEM =        0x1000
  82 };
  83
  84
  85 /*!
  86   * \brief Runs GPU sanity checks.
  87   *
  88   * Runs a series of checks to determine that the given GPU and underlying CUDA
  89   * driver/runtime functions properly.
  90   * Returns properties of a device with given ID or the one that has
  91   * already been initialized earlier in the case if of \dev_id == -1.
  92   *
  93   * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
  94   * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
  95   * \returns                0 if the device looks OK
  96   *
  97   * TODO: introduce errors codes and handle errors more smoothly.
  98   */
  99 static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
 100 {
 101     cudaError_t cu_err;
 102     int         dev_count, id;
 103
 104     cu_err = cudaGetDeviceCount(&dev_count);
 105     if (cu_err != cudaSuccess)
 106     {
 107        fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
 108                cudaGetErrorString(cu_err));
 109         return -1;
 110     }
 111
 112     /* no CUDA compatible device at all */
 113     if (dev_count == 0)
 114         return -1;
 115
 116     /* things might go horribly wrong if cudart is not compatible with the driver */
 117     if (dev_count < 0 || dev_count > cuda_max_device_count)
 118         return -1;
 119
 120     if (dev_id == -1) /* device already selected let's not destroy the context */
 121     {
 122         cu_err = cudaGetDevice(&id);
 123         if (cu_err != cudaSuccess)
 124         {
 125             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
 126                     cudaGetErrorString(cu_err));
 127             return -1;
 128         }
 129     }
 130     else
 131     {
 132         id = dev_id;
 133         if (id > dev_count - 1) /* pfff there's no such device */
 134         {
 135             fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
 136                     dev_id, dev_count);
 137             return -1;
 138         }
 139     }
 140
 141     memset(dev_prop, 0, sizeof(cudaDeviceProp));
 142     cu_err = cudaGetDeviceProperties(dev_prop, id);
 143     if (cu_err != cudaSuccess)
 144     {
 145         fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
 146                 cudaGetErrorString(cu_err));
 147         return -1;
 148     }
 149
 150     /* both major & minor is 9999 if no CUDA capable devices are present */
 151     if (dev_prop->major == 9999 && dev_prop->minor == 9999)
 152         return -1;
 153     /* we don't care about emulation mode */
 154     if (dev_prop->major == 0)
 155         return -1;
 156
 157     if (id != -1)
 158     {
 159         cu_err = cudaSetDevice(id);
 160         if (cu_err != cudaSuccess)
 161         {
 162             fprintf(stderr, "Error %d while switching to device #%d: %s\n",
 163                     cu_err, id, cudaGetErrorString(cu_err));
 164             return -1;
 165         }
 166     }
 167
 168     /* try to execute a dummy kernel */
 169     k_dummy_test<<<1, 512>>>();
 170     if (cudaThreadSynchronize() != cudaSuccess)
 171     {
 172         return -1;
 173     }
 174
 175     /* destroy context if we created one */
 176     if (id != -1)
 177     {
 178 #if CUDA_VERSION < 4000
 179         cu_err = cudaThreadExit();
 180         CU_RET_ERR(cu_err, "cudaThreadExit failed");
 181 #else
 182         cu_err = cudaDeviceReset();
 183         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 184 #endif
 185     }
 186
 187     return 0;
 188 }
 189
 190
 191 /*!
 192  * \brief Runs a set of memory tests specified by the given bit-flags.
 193  * Tries to allocate and do the test on \p megs Mb memory or
 194  * the greatest amount that can be allocated (>10Mb).
 195  * In case if an error is detected it stops without finishing the remaining
 196  * steps/iterations and returns greater then zero value.
 197  * In case of other errors (e.g. kernel launch errors, device querying errors)
 198  * -1 is returned.
 199  *
 200  * \param[in] which_tests   variable with bit-flags of the requested tests
 201  * \param[in] megs          amount of memory that will be tested in MB
 202  * \param[in] iter          number of iterations
 203  * \returns                 0 if no error was detected, otherwise >0
 204  */
 205 static int do_memtest(unsigned int which_tests, int megs, int iter)
 206 {
 207     memtestState    tester;
 208     int             i;
 209     uint            err_count; //, err_iter;
 210
 211     // no parameter check as this fn won't be called externally
 212
 213     // let's try to allocate the mem
 214     while (!tester.allocate(megs) && (megs - 10 > 0))
 215         { megs -= 10; tester.deallocate(); }
 216
 217     if (megs <= 10)
 218     {
 219         fprintf(stderr, "Unable to allocate GPU memory!\n");
 220         return -1;
 221     }
 222
 223     // clear the first 18 bits
 224     which_tests &= 0x3FFF;
 225     for (i = 0; i < iter; i++)
 226     {
 227         // Moving Inversions (ones and zeros)
 228         if ((MOVING_INVERSIONS_10 & which_tests) == MOVING_INVERSIONS_10)
 229         {
 230             tester.gpuMovingInversionsOnesZeros(err_count);
 231             if (err_count > 0)
 232                 return MOVING_INVERSIONS_10;
 233         }
 234         // Moving Inversions (random)
 235         if ((MOVING_INVERSIONS_RAND & which_tests) == MOVING_INVERSIONS_RAND)
 236         {
 237             tester.gpuMovingInversionsRandom(err_count);
 238             if (err_count > 0)
 239                 return MOVING_INVERSIONS_RAND;
 240         }
 241        // Memtest86 Walking 8-bit
 242         if ((WALKING_8BIT_M86 & which_tests) == WALKING_8BIT_M86)
 243         {
 244             for (uint shift = 0; shift < 8; shift++)
 245             {
 246                 tester.gpuWalking8BitM86(err_count, shift);
 247                 if (err_count > 0)
 248                     return WALKING_8BIT_M86;
 249             }
 250       }
 251         // True Walking zeros (8-bit)
 252         if ((WALKING_0_8BIT & which_tests) == WALKING_0_8BIT)
 253         {
 254             for (uint shift = 0; shift < 8; shift++)
 255             {
 256                 tester.gpuWalking8Bit(err_count, false, shift);
 257                 if (err_count > 0)
 258                     return WALKING_0_8BIT;
 259             }
 260         }
 261         // True Walking ones (8-bit)
 262         if ((WALKING_1_8BIT & which_tests) == WALKING_1_8BIT)
 263         {
 264             for (uint shift = 0; shift < 8; shift++)
 265             {
 266                 tester.gpuWalking8Bit(err_count, true, shift);
 267                 if (err_count > 0)
 268                     return WALKING_1_8BIT;
 269             }
 270         }
 271         // Memtest86 Walking zeros (32-bit)
 272         if ((WALKING_0_32BIT & which_tests) == WALKING_0_32BIT)
 273         {
 274             for (uint shift = 0; shift < 32; shift++)
 275             {
 276                 tester.gpuWalking32Bit(err_count, false, shift);
 277                 if (err_count > 0)
 278                     return WALKING_0_32BIT;
 279             }
 280         }
 281        // Memtest86 Walking ones (32-bit)
 282         if ((WALKING_1_32BIT & which_tests) == WALKING_1_32BIT)
 283         {
 284             for (uint shift = 0; shift < 32; shift++)
 285             {
 286                 tester.gpuWalking32Bit(err_count, true, shift);
 287                 if (err_count > 0)
 288                     return WALKING_1_32BIT;
 289             }
 290        }
 291         // Random blocks
 292         if ((RANDOM_BLOCKS & which_tests) == RANDOM_BLOCKS)
 293         {
 294             tester.gpuRandomBlocks(err_count,rand());
 295             if (err_count > 0)
 296                 return RANDOM_BLOCKS;
 297
 298         }
 299
 300         // Memtest86 Modulo-20
 301         if ((MOD_20_32BIT & which_tests) == MOD_20_32BIT)
 302         {
 303             for (uint shift = 0; shift < 20; shift++)
 304             {
 305                 tester.gpuModuloX(err_count, shift, rand(), 20, 2);
 306                 if (err_count > 0)
 307                     return MOD_20_32BIT;
 308             }
 309         }
 310         // Logic (one iteration)
 311         if ((LOGIC_1_ITER & which_tests) == LOGIC_1_ITER)
 312         {
 313             tester.gpuShortLCG0(err_count,1);
 314             if (err_count > 0)
 315                 return LOGIC_1_ITER;
 316         }
 317         // Logic (4 iterations)
 318         if ((LOGIC_4_ITER & which_tests) == LOGIC_4_ITER)
 319         {
 320             tester.gpuShortLCG0(err_count,4);
 321             if (err_count > 0)
 322                 return LOGIC_4_ITER;
 323
 324         }
 325         // Logic (shared memory, one iteration)
 326         if ((LOGIC_1_ITER_SHMEM & which_tests) == LOGIC_1_ITER_SHMEM)
 327         {
 328             tester.gpuShortLCG0Shmem(err_count,1);
 329             if (err_count > 0)
 330                 return LOGIC_1_ITER_SHMEM;
 331         }
 332         // Logic (shared-memory, 4 iterations)
 333         if ((LOGIC_4_ITER_SHMEM & which_tests) == LOGIC_4_ITER_SHMEM)
 334         {
 335             tester.gpuShortLCG0Shmem(err_count,4);
 336             if (err_count > 0)
 337                 return LOGIC_4_ITER_SHMEM;
 338         }
 339     }
 340
 341     tester.deallocate();
 342     return err_count;
 343 }
 344
 345 /*! \brief Runs a quick memory test and returns 0 in case if no error is detected.
 346  * If an error is detected it stops before completing the test and returns a
 347  * value greater then 0. In case of other errors (e.g. kernel launch errors,
 348  * device querying errors) -1 is returned.
 349  *
 350  * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
 351  * \returns             0 if no error was detected, otherwise >0
 352  */
 353 int do_quick_memtest(int dev_id)
 354 {
 355     cudaDeviceProp  dev_prop;
 356     int             devmem, res, time=0;
 357
 358     if (debug) { time = getTimeMilliseconds(); }
 359
 360     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 361     {
 362         // something went wrong
 363         return -1;
 364     }
 365
 366     if (debug)
 367     {
 368         devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 369         fprintf(debug, ">> Running QUICK memtests on %d MiB (out of total %d MiB), %d iterations\n",
 370             QUICK_MEM, devmem, QUICK_ITER);
 371     }
 372
 373     res = do_memtest(QUICK_TESTS, QUICK_MEM, QUICK_ITER);
 374
 375     if (debug)
 376     {
 377         fprintf(debug, "Q-RES = %d\n", res);
 378         fprintf(debug, "Q-runtime: %d ms\n", getTimeMilliseconds() - time);
 379     }
 380
 381     /* destroy context only if we created it */
 382     if (dev_id !=-1) cudaThreadExit();
 383     return res;
 384 }
 385
 386 /*! \brief Runs a full memory test and returns 0 in case if no error is detected.
 387  * If an error is detected  it stops before completing the test and returns a
 388  * value greater then 0. In case of other errors (e.g. kernel launch errors,
 389  * device querying errors) -1 is returned.
 390  *
 391  * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
 392  * \returns             0 if no error was detected, otherwise >0
 393  */
 394
 395 int do_full_memtest(int dev_id)
 396 {
 397     cudaDeviceProp  dev_prop;
 398     int             devmem, res, time=0;
 399
 400     if (debug) { time = getTimeMilliseconds(); }
 401
 402     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 403     {
 404         // something went wrong
 405         return -1;
 406     }
 407
 408     devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 409
 410     if (debug)
 411     {
 412         fprintf(debug, ">> Running FULL memtests on %d MiB (out of total %d MiB), %d iterations\n",
 413             devmem, devmem, FULL_ITER);
 414     }
 415
 416     /* do all test on the entire memory */
 417     res = do_memtest(FULL_TESTS, devmem, FULL_ITER);
 418
 419     if (debug)
 420     {
 421         fprintf(debug, "F-RES = %d\n", res);
 422         fprintf(debug, "F-runtime: %d ms\n", getTimeMilliseconds() - time);
 423     }
 424
 425     /* destroy context only if we created it */
 426     if (dev_id != -1) cudaThreadExit();
 427     return res;
 428 }
 429
 430 /*! \brief Runs a time constrained memory test and returns 0 in case if no error is detected.
 431  * If an error is detected it stops before completing the test and returns a value greater
 432  * than zero. In case of other errors (e.g. kernel launch errors, device querying errors) -1
 433  * is returned. Note, that test iterations are not interrupted therefor the total runtime of
 434  * the test will always be multipple of one iteration's runtime.
 435  *
 436  * \param[in] dev_id        the device id of the GPU or -1 if the device has laredy been selected
 437  * \param[in] time_constr   the time limit of the testing
 438  * \returns                 0 if no error was detected, otherwise >0
 439  */
 440 int do_timed_memtest(int dev_id, int time_constr)
 441 {
 442     cudaDeviceProp  dev_prop;
 443     int             devmem, res=0, time=0, startt;
 444
 445     if (debug) { time = getTimeMilliseconds(); }
 446
 447     time_constr *= 1000;  /* convert to ms for convenience */
 448     startt = getTimeMilliseconds();
 449
 450     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 451     {
 452         // something went wrong
 453         return -1;
 454     }
 455
 456     devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 457
 458     if (debug)
 459     {
 460         fprintf(debug, ">> Running time constrained memtests on %d MiB (out of total %d MiB), time limit of %d s \n",
 461         devmem, devmem, time_constr);
 462     }
 463
 464     /* do the TIMED_TESTS set, one step at a time on the entire memory
 465        that can be allocated, and stop when the given time is exceeded */
 466     while ( ((int)getTimeMilliseconds() - startt) < time_constr)
 467     {
 468         res = do_memtest(TIMED_TESTS, devmem, 1);
 469         if (res != 0) break;
 470     }
 471
 472     if (debug)
 473     {
 474         fprintf(debug, "T-RES = %d\n", res);
 475         fprintf(debug, "T-runtime: %d ms\n", getTimeMilliseconds() - time);
 476     }
 477
 478     /* destroy context only if we created it */
 479     if (dev_id != -1) cudaThreadExit();
 480     return res;
 481 }
 482
 483 /*! \brief Initializes the GPU with the given index.
 484  *
 485  * The varible \mygpu is the index of the GPU to initialize in the
 486  * gpu_info.cuda_dev array.
 487  *
 488  * \param[in]  mygpu        index of the GPU to initialize
 489  * \param[out] result_str   the message related to the error that occurred
 490  *                          during the initialization (if there was any).
 491  * \param[in] gpu_info      GPU info of all detected devices in the system.
 492  * \returns                 true if no error occurs during initialization.
 493  */
 494 gmx_bool init_gpu(int mygpu, char *result_str, const gmx_gpu_info_t *gpu_info)
 495 {
 496     cudaError_t stat;
 497     char sbuf[STRLEN];
 498     int gpuid;
 499
 500     assert(gpu_info);
 501     assert(result_str);
 502
 503     if (mygpu < 0 || mygpu >= gpu_info->ncuda_dev_use)
 504     {
 505         sprintf(sbuf, "Trying to initialize an inexistent GPU: "
 506                 "there are %d %s-selected GPU(s), but #%d was requested.",
 507                  gpu_info->ncuda_dev_use, gpu_info->bUserSet ? "user" : "auto", mygpu);
 508         gmx_incons(sbuf);
 509     }
 510
 511     gpuid = gpu_info->cuda_dev[gpu_info->cuda_dev_use[mygpu]].id;
 512
 513     stat = cudaSetDevice(gpuid);
 514     strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 515
 516     if (debug)
 517     {
 518         fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->cuda_dev[gpuid].prop.name);
 519     }
 520
 521     return (stat == cudaSuccess);
 522 }
 523
 524 /*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
 525  *
 526  * The context is explicitly destroyed and therefore all data uploaded to the GPU
 527  * is lost. This should only be called when none of this data is required anymore.
 528  *
 529  * \param[out] result_str   the message related to the error that occurred
 530  *                          during the initialization (if there was any).
 531  * \returns                 true if no error occurs during the freeing.
 532  */
 533 gmx_bool free_gpu(char *result_str)
 534 {
 535     cudaError_t stat;
 536
 537     assert(result_str);
 538
 539     if (debug)
 540     {
 541         int gpuid;
 542         stat = cudaGetDevice(&gpuid);
 543         CU_RET_ERR(stat, "cudaGetDevice failed");
 544         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 545     }
 546
 547 #if CUDA_VERSION < 4000
 548     stat = cudaThreadExit();
 549 #else
 550     stat = cudaDeviceReset();
 551 #endif
 552     strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 553
 554     return (stat == cudaSuccess);
 555 }
 556
 557 /*! \brief Returns true if the gpu characterized by the device properties is
 558  *  supported by the native gpu acceleration.
 559  *
 560  * \param[in] dev_prop  the CUDA device properties of the gpus to test.
 561  * \returns             true if the GPU properties passed indicate a compatible
 562  *                      GPU, otherwise false.
 563  */
 564 static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
 565 {
 566     return (dev_prop->major >= 2);
 567 }
 568
 569 /*! \brief Helper function that checks whether a given GPU status indicates compatible GPU.
 570  *
 571  * \param[in] stat  GPU status.
 572  * \returns         true if the provided status is egpuCompatible, otherwise false.
 573  */
 574 static bool is_compatible_gpu(int stat)
 575 {
 576     return (stat == egpuCompatible);
 577 }
 578
 579 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 580  *
 581  *  Returns a status value which indicates compatibility or one of the following
 582  *  errors: incompatibility, insistence, or insanity (=unexpected behavior).
 583  *  It also returns the respective device's properties in \dev_prop (if applicable).
 584  *
 585  *  \param[in]  dev_id   the ID of the GPU to check.
 586  *  \param[out] dev_prop the CUDA device properties of the device checked.
 587  *  \returns             the status of the requested device
 588  */
 589 static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
 590 {
 591     cudaError_t stat;
 592     int         ndev;
 593
 594     stat = cudaGetDeviceCount(&ndev);
 595     if (stat != cudaSuccess)
 596     {
 597         return egpuInsane;
 598     }
 599
 600     if (dev_id > ndev - 1)
 601     {
 602         return egpuNonexistent;
 603     }
 604
 605     /* TODO: currently we do not make a distinction between the type of errors
 606      * that can appear during sanity checks. This needs to be improved, e.g if
 607      * the dummy test kernel fails to execute with a "device busy message" we
 608      * should appropriately report that the device is busy instead of insane.
 609      */
 610     if (do_sanity_checks(dev_id, dev_prop) == 0)
 611     {
 612         if (is_gmx_supported_gpu(dev_prop))
 613         {
 614             return egpuCompatible;
 615         }
 616         else
 617         {
 618             return egpuIncompatible;
 619         }
 620     }
 621     else
 622     {
 623         return egpuInsane;
 624     }
 625 }
 626
 627
 628 /*! \brief Detect all NVIDIA GPUs in the system.
 629  *
 630  *  Will detect every NVIDIA GPU supported by the device driver in use. Also
 631  *  check for the compatibility of each and fill the gpu_info->cuda_dev array
 632  *  with the required information on each the device: ID, device properties,
 633  *  status.
 634  *
 635  *  \param[in] gpu_info    pointer to structure holding GPU information.
 636  *  \param[out] err_str    The error message of any CUDA API error that caused
 637  *                         the detection to fail (if there was any). The memory
 638  *                         the pointer points to should be managed externally.
 639  *  \returns               non-zero if the detection encountered a failure, zero otherwise.
 640  */
 641 int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
 642 {
 643     int             i, ndev, checkres, retval;
 644     cudaError_t     stat;
 645     cudaDeviceProp  prop;
 646     cuda_dev_info_t *devs;
 647
 648     assert(gpu_info);
 649     assert(err_str);
 650
 651     ndev    = 0;
 652     devs    = NULL;
 653
 654     stat = cudaGetDeviceCount(&ndev);
 655     if (stat != cudaSuccess)
 656     {
 657         const char *s;
 658
 659         /* cudaGetDeviceCount failed which means that there is something
 660          * wrong with the machine: driver-runtime mismatch, all GPUs being
 661          * busy in exclusive mode, or some other condition which should
 662          * result in us issuing a warning a falling back to CPUs. */
 663         retval = -1;
 664         s = cudaGetErrorString(stat);
 665         strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
 666     }
 667     else
 668     {
 669         snew(devs, ndev);
 670         for (i = 0; i < ndev; i++)
 671         {
 672             checkres = is_gmx_supported_gpu_id(i, &prop);
 673
 674             devs[i].id   = i;
 675             devs[i].prop = prop;
 676             devs[i].stat = checkres;
 677         }
 678         retval = 0;
 679     }
 680
 681     gpu_info->ncuda_dev = ndev;
 682     gpu_info->cuda_dev  = devs;
 683
 684     return retval;
 685 }
 686
 687 /*! \brief Select the GPUs compatible with the native GROMACS acceleration.
 688  *
 689  * This function selects the compatible gpus and initializes
 690  * gpu_info->cuda_dev_use and gpu_info->ncuda_dev_use.
 691  *
 692  * Given the list of GPUs available in the system the it checks each gpu in
 693  * gpu_info->cuda_dev and puts the the indices (into gpu_info->cuda_dev) of
 694  * the compatible ones into cuda_dev_use with this marking the respective
 695  * GPUs as "available for use."
 696  * Note that \detect_cuda_gpus must have been called before.
 697  *
 698  * \param[in]    gpu_info    pointer to structure holding GPU information
 699  */
 700 void pick_compatible_gpus(gmx_gpu_info_t *gpu_info)
 701 {
 702     int i, ncompat;
 703     int *compat;
 704
 705     assert(gpu_info);
 706     /* cuda_dev/ncuda_dev have to be either NULL/0 or not (NULL/0) */
 707     assert((gpu_info->ncuda_dev != 0 ? 0 : 1) ^ (gpu_info->cuda_dev == NULL ? 0 : 1));
 708
 709     snew(compat, gpu_info->ncuda_dev);
 710     ncompat = 0;
 711     for (i = 0; i < gpu_info->ncuda_dev; i++)
 712     {
 713         if (is_compatible_gpu(gpu_info->cuda_dev[i].stat))
 714         {
 715             ncompat++;
 716             compat[ncompat - 1] = i;
 717         }
 718     }
 719
 720     gpu_info->ncuda_dev_use = ncompat;
 721     snew(gpu_info->cuda_dev_use, ncompat);
 722     memcpy(gpu_info->cuda_dev_use, compat, ncompat*sizeof(*compat));
 723     sfree(compat);
 724 }
 725
 726 /*! \brief Check the existence/compatibility of a set of GPUs specified by their device IDs.
 727  *
 728  * Given the a list of GPU devide IDs in \requested_devs, check for the
 729  * existence and compatibility of the respective GPUs and fill in \gpu_info
 730  * with the collected information. Also provide the caller with an array with
 731  * the result of checks in \checkres.
 732  *
 733  * \param[out]  checkres    check result for each ID passed in \requested_devs
 734  * \param[in]   gpu_info    pointer to structure holding GPU information
 735  * \param[in]   requested_devs array of requested device IDs
 736  * \param[in]   count       number of IDs in \requested_devs
 737  * \returns                 TRUE if every requested GPU is compatible
 738  */
 739 gmx_bool check_select_cuda_gpus(int *checkres, gmx_gpu_info_t *gpu_info,
 740                                 const int *requested_devs, int count)
 741 {
 742     int i, id;
 743     bool bAllOk;
 744
 745     assert(checkres);
 746     assert(gpu_info);
 747     assert(requested_devs);
 748     assert(count >= 0);
 749
 750     if (count == 0)
 751     {
 752         return TRUE;
 753     }
 754
 755     /* we will assume that all GPUs requested are valid IDs,
 756        otherwise we'll bail anyways */
 757     gpu_info->ncuda_dev_use = count;
 758     snew(gpu_info->cuda_dev_use, count);
 759
 760     bAllOk = true;
 761     for (i = 0; i < count; i++)
 762     {
 763         id = requested_devs[i];
 764
 765         /* devices are stored in increasing order of IDs in cuda_dev */
 766         gpu_info->cuda_dev_use[i] = id;
 767
 768         checkres[i] = (id >= gpu_info->ncuda_dev) ?
 769             egpuNonexistent : gpu_info->cuda_dev[id].stat;
 770
 771         bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
 772     }
 773
 774     return bAllOk;
 775 }
 776
 777 /*! \brief Frees the cuda_dev and cuda_dev_use array fields of \gpu_info.
 778  *
 779  * \param[in]    gpu_info    pointer to structure holding GPU information
 780  */
 781 void free_gpu_info(const gmx_gpu_info_t *gpu_info)
 782 {
 783     if (gpu_info == NULL)
 784     {
 785         return;
 786     }
 787
 788     sfree(gpu_info->cuda_dev_use);
 789     sfree(gpu_info->cuda_dev);
 790 }
 791
 792 /*! \brief Formats and returns a device information string for a given GPU.
 793  *
 794  * Given an index *directly* into the array of available GPUs (cuda_dev)
 795  * returns a formatted info string for the respective GPU which includes
 796  * ID, name, compute capability, and detection status.
 797  *
 798  * \param[out]  s           pointer to output string (has to be allocated externally)
 799  * \param[in]   gpu_info    pointer to structure holding GPU information
 800  * \param[in]   index       an index *directly* into the array of available GPUs
 801  */
 802 void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index)
 803 {
 804     assert(s);
 805     assert(gpu_info);
 806
 807     if (index < 0 && index >= gpu_info->ncuda_dev)
 808     {
 809         return;
 810     }
 811
 812     cuda_dev_info_t *dinfo = &gpu_info->cuda_dev[index];
 813
 814     bool bGpuExists =
 815         dinfo->stat == egpuCompatible ||
 816         dinfo->stat == egpuIncompatible;
 817
 818     if (!bGpuExists)
 819     {
 820         sprintf(s, "#%d: %s, stat: %s",
 821                 dinfo->id, "N/A",
 822                 gpu_detect_res_str[dinfo->stat]);
 823     }
 824     else
 825     {
 826         sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
 827                 dinfo->id, dinfo->prop.name,
 828                 dinfo->prop.major, dinfo->prop.minor,
 829                 dinfo->prop.ECCEnabled ? "yes" : " no",
 830                 gpu_detect_res_str[dinfo->stat]);
 831     }
 832 }
 833
 834 /*! \brief Returns the device ID of the GPU with a given index into the array of used GPUs.
 835  *
 836  * Getter function which, given an index into the array of GPUs in use
 837  * (cuda_dev_use) -- typically a tMPI/MPI rank --, returns the device ID of the
 838  * respective CUDA GPU.
 839  *
 840  * \param[in]    gpu_info   pointer to structure holding GPU information
 841  * \param[in]    idx        index into the array of used GPUs
 842  * \returns                 device ID of the requested GPU
 843  */
 844 int get_gpu_device_id(const gmx_gpu_info_t *gpu_info, int idx)
 845 {
 846     assert(gpu_info);
 847     if (idx < 0 && idx >= gpu_info->ncuda_dev_use)
 848     {
 849         return -1;
 850     }
 851
 852     return gpu_info->cuda_dev[gpu_info->cuda_dev_use[idx]].id;
 853 }
 854
 855 /*! \brief Returns the device ID of the GPU currently in use.
 856  *
 857  * The GPU used is the one that is active at the time of the call in the active context.
 858  *
 859  * \param[in]    gpu_info   pointer to structure holding GPU information
 860  * \returns                 device ID of the GPU in use at the time of the call
 861  */
 862 int get_current_gpu_device_id(void)
 863 {
 864     int gpuid;
 865     CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 866
 867     return gpuid;
 868 }