src/gmxlib/gpu_utils/gpu_utils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2010,2012 The GROMACS development team,
   6  * check out http://www.gromacs.org for more information.
   7  * Copyright (c) 2012, by the GROMACS development team, led by
   8  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   9  * others, as listed in the AUTHORS file in the top-level source
  10  * directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38
  39 #include <stdio.h>
  40 #include <stdlib.h>
  41 #include <assert.h>
  42
  43 #include "smalloc.h"
  44 #include "string2.h"
  45 #include "types/hw_info.h"
  46
  47 #include "gpu_utils.h"
  48 #include "../cuda_tools/cudautils.cuh"
  49 #include "memtestG80_core.h"
  50
  51
  52 #define QUICK_MEM       250 /*!< Amount of memory to be used in quick memtest. */
  53 #define QUICK_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS /*!< Bit flag with type of tests
  54                                                                             to run in quick memtest. */
  55 #define QUICK_ITER      3 /*!< Number of iterations in quick memtest. */
  56
  57 #define FULL_TESTS      0x3FFF /*!<  Bitflag with all test set on for full memetest. */
  58 #define FULL_ITER       25 /*!< Number of iterations in full memtest. */
  59
  60 #define TIMED_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS /*!< Bit flag with type of tests to
  61                                                                             run in time constrained memtest. */
  62
  63 static int cuda_max_device_count = 32; /*! Max number of devices supported by CUDA (for consistency checking).
  64                                            In reality it 16 with CUDA <=v5.0, but let's stay on the safe side. */
  65
  66 /*! Dummy kernel used for sanity checking. */
  67 __global__ void k_dummy_test(){}
  68
  69
  70 /*! Bit-flags which refer to memtestG80 test types and are used in do_memtest to specify which tests to run. */
  71 enum memtest_G80_test_types {
  72     MOVING_INVERSIONS_10 =      0x1,
  73     MOVING_INVERSIONS_RAND =    0x2,
  74     WALKING_8BIT_M86 =          0x4,
  75     WALKING_0_8BIT =            0x8,
  76     WALKING_1_8BIT =            0x10,
  77     WALKING_0_32BIT =           0x20,
  78     WALKING_1_32BIT =           0x40,
  79     RANDOM_BLOCKS =             0x80,
  80     MOD_20_32BIT =              0x100,
  81     LOGIC_1_ITER =              0x200,
  82     LOGIC_4_ITER =              0x400,
  83     LOGIC_1_ITER_SHMEM =        0x800,
  84     LOGIC_4_ITER_SHMEM =        0x1000
  85 };
  86
  87
  88 /*!
  89   * \brief Runs GPU sanity checks.
  90   *
  91   * Runs a series of checks to determine that the given GPU and underlying CUDA
  92   * driver/runtime functions properly.
  93   * Returns properties of a device with given ID or the one that has
  94   * already been initialized earlier in the case if of \dev_id == -1.
  95   *
  96   * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
  97   * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
  98   * \returns                0 if the device looks OK
  99   *
 100   * TODO: introduce errors codes and handle errors more smoothly.
 101   */
 102 static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
 103 {
 104     cudaError_t cu_err;
 105     int         dev_count, id;
 106
 107     cu_err = cudaGetDeviceCount(&dev_count);
 108     if (cu_err != cudaSuccess)
 109     {
 110        fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
 111                cudaGetErrorString(cu_err));
 112         return -1;
 113     }
 114
 115     /* no CUDA compatible device at all */
 116     if (dev_count == 0)
 117         return -1;
 118
 119     /* things might go horribly wrong if cudart is not compatible with the driver */
 120     if (dev_count < 0 || dev_count > cuda_max_device_count)
 121         return -1;
 122
 123     if (dev_id == -1) /* device already selected let's not destroy the context */
 124     {
 125         cu_err = cudaGetDevice(&id);
 126         if (cu_err != cudaSuccess)
 127         {
 128             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
 129                     cudaGetErrorString(cu_err));
 130             return -1;
 131         }
 132     }
 133     else
 134     {
 135         id = dev_id;
 136         if (id > dev_count - 1) /* pfff there's no such device */
 137         {
 138             fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
 139                     dev_id, dev_count);
 140             return -1;
 141         }
 142     }
 143
 144     memset(dev_prop, 0, sizeof(cudaDeviceProp));
 145     cu_err = cudaGetDeviceProperties(dev_prop, id);
 146     if (cu_err != cudaSuccess)
 147     {
 148         fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
 149                 cudaGetErrorString(cu_err));
 150         return -1;
 151     }
 152
 153     /* both major & minor is 9999 if no CUDA capable devices are present */
 154     if (dev_prop->major == 9999 && dev_prop->minor == 9999)
 155         return -1;
 156     /* we don't care about emulation mode */
 157     if (dev_prop->major == 0)
 158         return -1;
 159
 160     if (id != -1)
 161     {
 162         cu_err = cudaSetDevice(id);
 163         if (cu_err != cudaSuccess)
 164         {
 165             fprintf(stderr, "Error %d while switching to device #%d: %s\n",
 166                     cu_err, id, cudaGetErrorString(cu_err));
 167             return -1;
 168         }
 169     }
 170
 171     /* try to execute a dummy kernel */
 172     k_dummy_test<<<1, 512>>>();
 173     if (cudaThreadSynchronize() != cudaSuccess)
 174     {
 175         return -1;
 176     }
 177
 178     /* destroy context if we created one */
 179     if (id != -1)
 180     {
 181 #if CUDA_VERSION < 4000
 182         cu_err = cudaThreadExit();
 183         CU_RET_ERR(cu_err, "cudaThreadExit failed");
 184 #else
 185         cu_err = cudaDeviceReset();
 186         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 187 #endif
 188     }
 189
 190     return 0;
 191 }
 192
 193
 194 /*!
 195  * \brief Runs a set of memory tests specified by the given bit-flags.
 196  * Tries to allocate and do the test on \p megs Mb memory or
 197  * the greatest amount that can be allocated (>10Mb).
 198  * In case if an error is detected it stops without finishing the remaining
 199  * steps/iterations and returns greater then zero value.
 200  * In case of other errors (e.g. kernel launch errors, device querying errors)
 201  * -1 is returned.
 202  *
 203  * \param[in] which_tests   variable with bit-flags of the requested tests
 204  * \param[in] megs          amount of memory that will be tested in MB
 205  * \param[in] iter          number of iterations
 206  * \returns                 0 if no error was detected, otherwise >0
 207  */
 208 static int do_memtest(unsigned int which_tests, int megs, int iter)
 209 {
 210     memtestState    tester;
 211     int             i;
 212     uint            err_count; //, err_iter;
 213
 214     // no parameter check as this fn won't be called externally
 215
 216     // let's try to allocate the mem
 217     while (!tester.allocate(megs) && (megs - 10 > 0))
 218         { megs -= 10; tester.deallocate(); }
 219
 220     if (megs <= 10)
 221     {
 222         fprintf(stderr, "Unable to allocate GPU memory!\n");
 223         return -1;
 224     }
 225
 226     // clear the first 18 bits
 227     which_tests &= 0x3FFF;
 228     for (i = 0; i < iter; i++)
 229     {
 230         // Moving Inversions (ones and zeros)
 231         if ((MOVING_INVERSIONS_10 & which_tests) == MOVING_INVERSIONS_10)
 232         {
 233             tester.gpuMovingInversionsOnesZeros(err_count);
 234             if (err_count > 0)
 235                 return MOVING_INVERSIONS_10;
 236         }
 237         // Moving Inversions (random)
 238         if ((MOVING_INVERSIONS_RAND & which_tests) == MOVING_INVERSIONS_RAND)
 239         {
 240             tester.gpuMovingInversionsRandom(err_count);
 241             if (err_count > 0)
 242                 return MOVING_INVERSIONS_RAND;
 243         }
 244        // Memtest86 Walking 8-bit
 245         if ((WALKING_8BIT_M86 & which_tests) == WALKING_8BIT_M86)
 246         {
 247             for (uint shift = 0; shift < 8; shift++)
 248             {
 249                 tester.gpuWalking8BitM86(err_count, shift);
 250                 if (err_count > 0)
 251                     return WALKING_8BIT_M86;
 252             }
 253       }
 254         // True Walking zeros (8-bit)
 255         if ((WALKING_0_8BIT & which_tests) == WALKING_0_8BIT)
 256         {
 257             for (uint shift = 0; shift < 8; shift++)
 258             {
 259                 tester.gpuWalking8Bit(err_count, false, shift);
 260                 if (err_count > 0)
 261                     return WALKING_0_8BIT;
 262             }
 263         }
 264         // True Walking ones (8-bit)
 265         if ((WALKING_1_8BIT & which_tests) == WALKING_1_8BIT)
 266         {
 267             for (uint shift = 0; shift < 8; shift++)
 268             {
 269                 tester.gpuWalking8Bit(err_count, true, shift);
 270                 if (err_count > 0)
 271                     return WALKING_1_8BIT;
 272             }
 273         }
 274         // Memtest86 Walking zeros (32-bit)
 275         if ((WALKING_0_32BIT & which_tests) == WALKING_0_32BIT)
 276         {
 277             for (uint shift = 0; shift < 32; shift++)
 278             {
 279                 tester.gpuWalking32Bit(err_count, false, shift);
 280                 if (err_count > 0)
 281                     return WALKING_0_32BIT;
 282             }
 283         }
 284        // Memtest86 Walking ones (32-bit)
 285         if ((WALKING_1_32BIT & which_tests) == WALKING_1_32BIT)
 286         {
 287             for (uint shift = 0; shift < 32; shift++)
 288             {
 289                 tester.gpuWalking32Bit(err_count, true, shift);
 290                 if (err_count > 0)
 291                     return WALKING_1_32BIT;
 292             }
 293        }
 294         // Random blocks
 295         if ((RANDOM_BLOCKS & which_tests) == RANDOM_BLOCKS)
 296         {
 297             tester.gpuRandomBlocks(err_count,rand());
 298             if (err_count > 0)
 299                 return RANDOM_BLOCKS;
 300
 301         }
 302
 303         // Memtest86 Modulo-20
 304         if ((MOD_20_32BIT & which_tests) == MOD_20_32BIT)
 305         {
 306             for (uint shift = 0; shift < 20; shift++)
 307             {
 308                 tester.gpuModuloX(err_count, shift, rand(), 20, 2);
 309                 if (err_count > 0)
 310                     return MOD_20_32BIT;
 311             }
 312         }
 313         // Logic (one iteration)
 314         if ((LOGIC_1_ITER & which_tests) == LOGIC_1_ITER)
 315         {
 316             tester.gpuShortLCG0(err_count,1);
 317             if (err_count > 0)
 318                 return LOGIC_1_ITER;
 319         }
 320         // Logic (4 iterations)
 321         if ((LOGIC_4_ITER & which_tests) == LOGIC_4_ITER)
 322         {
 323             tester.gpuShortLCG0(err_count,4);
 324             if (err_count > 0)
 325                 return LOGIC_4_ITER;
 326
 327         }
 328         // Logic (shared memory, one iteration)
 329         if ((LOGIC_1_ITER_SHMEM & which_tests) == LOGIC_1_ITER_SHMEM)
 330         {
 331             tester.gpuShortLCG0Shmem(err_count,1);
 332             if (err_count > 0)
 333                 return LOGIC_1_ITER_SHMEM;
 334         }
 335         // Logic (shared-memory, 4 iterations)
 336         if ((LOGIC_4_ITER_SHMEM & which_tests) == LOGIC_4_ITER_SHMEM)
 337         {
 338             tester.gpuShortLCG0Shmem(err_count,4);
 339             if (err_count > 0)
 340                 return LOGIC_4_ITER_SHMEM;
 341         }
 342     }
 343
 344     tester.deallocate();
 345     return err_count;
 346 }
 347
 348 /*! \brief Runs a quick memory test and returns 0 in case if no error is detected.
 349  * If an error is detected it stops before completing the test and returns a
 350  * value greater then 0. In case of other errors (e.g. kernel launch errors,
 351  * device querying errors) -1 is returned.
 352  *
 353  * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
 354  * \returns             0 if no error was detected, otherwise >0
 355  */
 356 int do_quick_memtest(int dev_id)
 357 {
 358     cudaDeviceProp  dev_prop;
 359     int             devmem, res, time=0;
 360
 361     if (debug) { time = getTimeMilliseconds(); }
 362
 363     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 364     {
 365         // something went wrong
 366         return -1;
 367     }
 368
 369     if (debug)
 370     {
 371         devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 372         fprintf(debug, ">> Running QUICK memtests on %d MiB (out of total %d MiB), %d iterations\n",
 373             QUICK_MEM, devmem, QUICK_ITER);
 374     }
 375
 376     res = do_memtest(QUICK_TESTS, QUICK_MEM, QUICK_ITER);
 377
 378     if (debug)
 379     {
 380         fprintf(debug, "Q-RES = %d\n", res);
 381         fprintf(debug, "Q-runtime: %d ms\n", getTimeMilliseconds() - time);
 382     }
 383
 384     /* destroy context only if we created it */
 385     if (dev_id !=-1) cudaThreadExit();
 386     return res;
 387 }
 388
 389 /*! \brief Runs a full memory test and returns 0 in case if no error is detected.
 390  * If an error is detected  it stops before completing the test and returns a
 391  * value greater then 0. In case of other errors (e.g. kernel launch errors,
 392  * device querying errors) -1 is returned.
 393  *
 394  * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
 395  * \returns             0 if no error was detected, otherwise >0
 396  */
 397
 398 int do_full_memtest(int dev_id)
 399 {
 400     cudaDeviceProp  dev_prop;
 401     int             devmem, res, time=0;
 402
 403     if (debug) { time = getTimeMilliseconds(); }
 404
 405     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 406     {
 407         // something went wrong
 408         return -1;
 409     }
 410
 411     devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 412
 413     if (debug)
 414     {
 415         fprintf(debug, ">> Running FULL memtests on %d MiB (out of total %d MiB), %d iterations\n",
 416             devmem, devmem, FULL_ITER);
 417     }
 418
 419     /* do all test on the entire memory */
 420     res = do_memtest(FULL_TESTS, devmem, FULL_ITER);
 421
 422     if (debug)
 423     {
 424         fprintf(debug, "F-RES = %d\n", res);
 425         fprintf(debug, "F-runtime: %d ms\n", getTimeMilliseconds() - time);
 426     }
 427
 428     /* destroy context only if we created it */
 429     if (dev_id != -1) cudaThreadExit();
 430     return res;
 431 }
 432
 433 /*! \brief Runs a time constrained memory test and returns 0 in case if no error is detected.
 434  * If an error is detected it stops before completing the test and returns a value greater
 435  * than zero. In case of other errors (e.g. kernel launch errors, device querying errors) -1
 436  * is returned. Note, that test iterations are not interrupted therefor the total runtime of
 437  * the test will always be multipple of one iteration's runtime.
 438  *
 439  * \param[in] dev_id        the device id of the GPU or -1 if the device has laredy been selected
 440  * \param[in] time_constr   the time limit of the testing
 441  * \returns                 0 if no error was detected, otherwise >0
 442  */
 443 int do_timed_memtest(int dev_id, int time_constr)
 444 {
 445     cudaDeviceProp  dev_prop;
 446     int             devmem, res=0, time=0, startt;
 447
 448     if (debug) { time = getTimeMilliseconds(); }
 449
 450     time_constr *= 1000;  /* convert to ms for convenience */
 451     startt = getTimeMilliseconds();
 452
 453     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 454     {
 455         // something went wrong
 456         return -1;
 457     }
 458
 459     devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 460
 461     if (debug)
 462     {
 463         fprintf(debug, ">> Running time constrained memtests on %d MiB (out of total %d MiB), time limit of %d s \n",
 464         devmem, devmem, time_constr);
 465     }
 466
 467     /* do the TIMED_TESTS set, one step at a time on the entire memory
 468        that can be allocated, and stop when the given time is exceeded */
 469     while ( ((int)getTimeMilliseconds() - startt) < time_constr)
 470     {
 471         res = do_memtest(TIMED_TESTS, devmem, 1);
 472         if (res != 0) break;
 473     }
 474
 475     if (debug)
 476     {
 477         fprintf(debug, "T-RES = %d\n", res);
 478         fprintf(debug, "T-runtime: %d ms\n", getTimeMilliseconds() - time);
 479     }
 480
 481     /* destroy context only if we created it */
 482     if (dev_id != -1) cudaThreadExit();
 483     return res;
 484 }
 485
 486 /*! \brief Initializes the GPU with the given index.
 487  *
 488  * The varible \mygpu is the index of the GPU to initialize in the
 489  * gpu_info.cuda_dev array.
 490  *
 491  * \param[in]  mygpu        index of the GPU to initialize
 492  * \param[out] result_str   the message related to the error that occurred
 493  *                          during the initialization (if there was any).
 494  * \param[in] gpu_info      GPU info of all detected devices in the system.
 495  * \param[in] gpu_opt       options for using the GPUs in gpu_info
 496  * \returns                 true if no error occurs during initialization.
 497  */
 498 gmx_bool init_gpu(int mygpu, char *result_str,
 499                   const gmx_gpu_info_t *gpu_info,
 500                   const gmx_gpu_opt_t *gpu_opt)
 501 {
 502     cudaError_t stat;
 503     char sbuf[STRLEN];
 504     int gpuid;
 505
 506     assert(gpu_info);
 507     assert(result_str);
 508
 509     if (mygpu < 0 || mygpu >= gpu_opt->ncuda_dev_use)
 510     {
 511         sprintf(sbuf, "Trying to initialize an inexistent GPU: "
 512                 "there are %d %s-selected GPU(s), but #%d was requested.",
 513                  gpu_opt->ncuda_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
 514         gmx_incons(sbuf);
 515     }
 516
 517     gpuid = gpu_info->cuda_dev[gpu_opt->cuda_dev_use[mygpu]].id;
 518
 519     stat = cudaSetDevice(gpuid);
 520     strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 521
 522     if (debug)
 523     {
 524         fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->cuda_dev[gpuid].prop.name);
 525     }
 526
 527     return (stat == cudaSuccess);
 528 }
 529
 530 /*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
 531  *
 532  * The context is explicitly destroyed and therefore all data uploaded to the GPU
 533  * is lost. This should only be called when none of this data is required anymore.
 534  *
 535  * \param[out] result_str   the message related to the error that occurred
 536  *                          during the initialization (if there was any).
 537  * \returns                 true if no error occurs during the freeing.
 538  */
 539 gmx_bool free_gpu(char *result_str)
 540 {
 541     cudaError_t stat;
 542
 543     assert(result_str);
 544
 545     if (debug)
 546     {
 547         int gpuid;
 548         stat = cudaGetDevice(&gpuid);
 549         CU_RET_ERR(stat, "cudaGetDevice failed");
 550         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 551     }
 552
 553 #if CUDA_VERSION < 4000
 554     stat = cudaThreadExit();
 555 #else
 556     stat = cudaDeviceReset();
 557 #endif
 558     strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 559
 560     return (stat == cudaSuccess);
 561 }
 562
 563 /*! \brief Returns true if the gpu characterized by the device properties is
 564  *  supported by the native gpu acceleration.
 565  *
 566  * \param[in] dev_prop  the CUDA device properties of the gpus to test.
 567  * \returns             true if the GPU properties passed indicate a compatible
 568  *                      GPU, otherwise false.
 569  */
 570 static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
 571 {
 572     return (dev_prop->major >= 2);
 573 }
 574
 575 /*! \brief Helper function that checks whether a given GPU status indicates compatible GPU.
 576  *
 577  * \param[in] stat  GPU status.
 578  * \returns         true if the provided status is egpuCompatible, otherwise false.
 579  */
 580 static bool is_compatible_gpu(int stat)
 581 {
 582     return (stat == egpuCompatible);
 583 }
 584
 585 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 586  *
 587  *  Returns a status value which indicates compatibility or one of the following
 588  *  errors: incompatibility, insistence, or insanity (=unexpected behavior).
 589  *  It also returns the respective device's properties in \dev_prop (if applicable).
 590  *
 591  *  \param[in]  dev_id   the ID of the GPU to check.
 592  *  \param[out] dev_prop the CUDA device properties of the device checked.
 593  *  \returns             the status of the requested device
 594  */
 595 static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
 596 {
 597     cudaError_t stat;
 598     int         ndev;
 599
 600     stat = cudaGetDeviceCount(&ndev);
 601     if (stat != cudaSuccess)
 602     {
 603         return egpuInsane;
 604     }
 605
 606     if (dev_id > ndev - 1)
 607     {
 608         return egpuNonexistent;
 609     }
 610
 611     /* TODO: currently we do not make a distinction between the type of errors
 612      * that can appear during sanity checks. This needs to be improved, e.g if
 613      * the dummy test kernel fails to execute with a "device busy message" we
 614      * should appropriately report that the device is busy instead of insane.
 615      */
 616     if (do_sanity_checks(dev_id, dev_prop) == 0)
 617     {
 618         if (is_gmx_supported_gpu(dev_prop))
 619         {
 620             return egpuCompatible;
 621         }
 622         else
 623         {
 624             return egpuIncompatible;
 625         }
 626     }
 627     else
 628     {
 629         return egpuInsane;
 630     }
 631 }
 632
 633
 634 /*! \brief Detect all NVIDIA GPUs in the system.
 635  *
 636  *  Will detect every NVIDIA GPU supported by the device driver in use. Also
 637  *  check for the compatibility of each and fill the gpu_info->cuda_dev array
 638  *  with the required information on each the device: ID, device properties,
 639  *  status.
 640  *
 641  *  \param[in] gpu_info    pointer to structure holding GPU information.
 642  *  \param[out] err_str    The error message of any CUDA API error that caused
 643  *                         the detection to fail (if there was any). The memory
 644  *                         the pointer points to should be managed externally.
 645  *  \returns               non-zero if the detection encountered a failure, zero otherwise.
 646  */
 647 int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
 648 {
 649     int             i, ndev, checkres, retval;
 650     cudaError_t     stat;
 651     cudaDeviceProp  prop;
 652     cuda_dev_info_t *devs;
 653
 654     assert(gpu_info);
 655     assert(err_str);
 656
 657     gpu_info->ncuda_dev_compatible = 0;
 658
 659     ndev    = 0;
 660     devs    = NULL;
 661
 662     stat = cudaGetDeviceCount(&ndev);
 663     if (stat != cudaSuccess)
 664     {
 665         const char *s;
 666
 667         /* cudaGetDeviceCount failed which means that there is something
 668          * wrong with the machine: driver-runtime mismatch, all GPUs being
 669          * busy in exclusive mode, or some other condition which should
 670          * result in us issuing a warning a falling back to CPUs. */
 671         retval = -1;
 672         s = cudaGetErrorString(stat);
 673         strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
 674     }
 675     else
 676     {
 677         snew(devs, ndev);
 678         for (i = 0; i < ndev; i++)
 679         {
 680             checkres = is_gmx_supported_gpu_id(i, &prop);
 681
 682             devs[i].id   = i;
 683             devs[i].prop = prop;
 684             devs[i].stat = checkres;
 685
 686             if (checkres == egpuCompatible)
 687             {
 688                 gpu_info->ncuda_dev_compatible++;
 689             }
 690         }
 691         retval = 0;
 692     }
 693
 694     gpu_info->ncuda_dev = ndev;
 695     gpu_info->cuda_dev  = devs;
 696
 697     return retval;
 698 }
 699
 700 /*! \brief Select the GPUs compatible with the native GROMACS acceleration.
 701  *
 702  * This function selects the compatible gpus and initializes
 703  * gpu_info->cuda_dev_use and gpu_info->ncuda_dev_use.
 704  *
 705  * Given the list of GPUs available in the system check each device in
 706  * gpu_info->cuda_dev and place the indices of the compatible GPUs into
 707  * cuda_dev_use with this marking the respective GPUs as "available for use."
 708  * Note that \detect_cuda_gpus must have been called before.
 709  *
 710  * \param[in]     gpu_info    pointer to structure holding GPU information
 711  * \param[in,out] gpu_opt     pointer to structure holding GPU options
 712  */
 713 void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info,
 714                           gmx_gpu_opt_t *gpu_opt)
 715 {
 716     int i, ncompat;
 717     int *compat;
 718
 719     assert(gpu_info);
 720     /* cuda_dev/ncuda_dev have to be either NULL/0 or not (NULL/0) */
 721     assert((gpu_info->ncuda_dev != 0 ? 0 : 1) ^ (gpu_info->cuda_dev == NULL ? 0 : 1));
 722
 723     snew(compat, gpu_info->ncuda_dev);
 724     ncompat = 0;
 725     for (i = 0; i < gpu_info->ncuda_dev; i++)
 726     {
 727         if (is_compatible_gpu(gpu_info->cuda_dev[i].stat))
 728         {
 729             ncompat++;
 730             compat[ncompat - 1] = i;
 731         }
 732     }
 733
 734     gpu_opt->ncuda_dev_use = ncompat;
 735     snew(gpu_opt->cuda_dev_use, ncompat);
 736     memcpy(gpu_opt->cuda_dev_use, compat, ncompat*sizeof(*compat));
 737     sfree(compat);
 738 }
 739
 740 /*! \brief Check the existence/compatibility of a set of GPUs specified by their device IDs.
 741  *
 742  * Given the a list of gpu->ncuda_dev_use GPU device IDs stored in
 743  * gpu_opt->cuda_dev_use check the existence and compatibility
 744  * of the respective GPUs. Also provide the caller with an array containing
 745  * the result of checks in \checkres.
 746  *
 747  * \param[out]  checkres    check result for each ID passed in \requested_devs
 748  * \param[in]   gpu_info    pointer to structure holding GPU information
 749  * \param[out]  gpu_opt     pointer to structure holding GPU options
 750  * \returns                 TRUE if every the requested GPUs are compatible
 751  */
 752 gmx_bool check_selected_cuda_gpus(int *checkres,
 753                                   const gmx_gpu_info_t *gpu_info,
 754                                   gmx_gpu_opt_t *gpu_opt)
 755 {
 756     int i, id;
 757     bool bAllOk;
 758
 759     assert(checkres);
 760     assert(gpu_info);
 761     assert(gpu_opt->ncuda_dev_use >= 0);
 762
 763     if (gpu_opt->ncuda_dev_use == 0)
 764     {
 765         return TRUE;
 766     }
 767
 768     assert(gpu_opt->cuda_dev_use);
 769
 770     /* we will assume that all GPUs requested are valid IDs,
 771        otherwise we'll bail anyways */
 772
 773     bAllOk = true;
 774     for (i = 0; i < gpu_opt->ncuda_dev_use; i++)
 775     {
 776         id = gpu_opt->cuda_dev_use[i];
 777
 778         /* devices are stored in increasing order of IDs in cuda_dev */
 779         gpu_opt->cuda_dev_use[i] = id;
 780
 781         checkres[i] = (id >= gpu_info->ncuda_dev) ?
 782             egpuNonexistent : gpu_info->cuda_dev[id].stat;
 783
 784         bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
 785     }
 786
 787     return bAllOk;
 788 }
 789
 790 /*! \brief Frees the cuda_dev and cuda_dev_use array fields of \gpu_info.
 791  *
 792  * \param[in]    gpu_info    pointer to structure holding GPU information
 793  */
 794 void free_gpu_info(const gmx_gpu_info_t *gpu_info)
 795 {
 796     if (gpu_info == NULL)
 797     {
 798         return;
 799     }
 800
 801     sfree(gpu_info->cuda_dev);
 802 }
 803
 804 /*! \brief Formats and returns a device information string for a given GPU.
 805  *
 806  * Given an index *directly* into the array of available GPUs (cuda_dev)
 807  * returns a formatted info string for the respective GPU which includes
 808  * ID, name, compute capability, and detection status.
 809  *
 810  * \param[out]  s           pointer to output string (has to be allocated externally)
 811  * \param[in]   gpu_info    pointer to structure holding GPU information
 812  * \param[in]   index       an index *directly* into the array of available GPUs
 813  */
 814 void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index)
 815 {
 816     assert(s);
 817     assert(gpu_info);
 818
 819     if (index < 0 && index >= gpu_info->ncuda_dev)
 820     {
 821         return;
 822     }
 823
 824     cuda_dev_info_t *dinfo = &gpu_info->cuda_dev[index];
 825
 826     bool bGpuExists =
 827         dinfo->stat == egpuCompatible ||
 828         dinfo->stat == egpuIncompatible;
 829
 830     if (!bGpuExists)
 831     {
 832         sprintf(s, "#%d: %s, stat: %s",
 833                 dinfo->id, "N/A",
 834                 gpu_detect_res_str[dinfo->stat]);
 835     }
 836     else
 837     {
 838         sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
 839                 dinfo->id, dinfo->prop.name,
 840                 dinfo->prop.major, dinfo->prop.minor,
 841                 dinfo->prop.ECCEnabled ? "yes" : " no",
 842                 gpu_detect_res_str[dinfo->stat]);
 843     }
 844 }
 845
 846 /*! \brief Returns the device ID of the GPU with a given index into the array of used GPUs.
 847  *
 848  * Getter function which, given an index into the array of GPUs in use
 849  * (cuda_dev_use) -- typically a tMPI/MPI rank --, returns the device ID of the
 850  * respective CUDA GPU.
 851  *
 852  * \param[in]    gpu_info   pointer to structure holding GPU information
 853  * \param[in]    gpu_opt    pointer to structure holding GPU options
 854  * \param[in]    idx        index into the array of used GPUs
 855  * \returns                 device ID of the requested GPU
 856  */
 857 int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
 858                       const gmx_gpu_opt_t *gpu_opt,
 859                       int idx)
 860 {
 861     assert(gpu_info);
 862     assert(gpu_opt);
 863     if (idx < 0 && idx >= gpu_opt->ncuda_dev_use)
 864     {
 865         return -1;
 866     }
 867
 868     return gpu_info->cuda_dev[gpu_opt->cuda_dev_use[idx]].id;
 869 }
 870
 871 /*! \brief Returns the device ID of the GPU currently in use.
 872  *
 873  * The GPU used is the one that is active at the time of the call in the active context.
 874  *
 875  * \param[in]    gpu_info   pointer to structure holding GPU information
 876  * \returns                 device ID of the GPU in use at the time of the call
 877  */
 878 int get_current_gpu_device_id(void)
 879 {
 880     int gpuid;
 881     CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 882
 883     return gpuid;
 884 }