src/gromacs/gmxlib/gpu_utils/gpu_utils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2010,2011,2012,2013, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "smalloc.h"
#include "string2.h"
#include "types/hw_info.h"

#include "gpu_utils.h"
#include "../cuda_tools/cudautils.cuh"
#include "memtestG80_core.h"
  47
  48 /** Amount of memory to be used in quick memtest. */
  49 #define QUICK_MEM       250
  50 /** Bit flag with type of tests to run in quick memtest. */
  51 #define QUICK_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS
  52 /** Number of iterations in quick memtest. */
  53 #define QUICK_ITER      3
  54
  55 /** Bitflag with all test set on for full memetest. */
  56 #define FULL_TESTS      0x3FFF
  57 /** Number of iterations in full memtest. */
  58 #define FULL_ITER       25
  59
  60 /** Bit flag with type of tests to run in time constrained memtest. */
  61 #define TIMED_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS
  62
  63 /*! \brief
  64  * Max number of devices supported by CUDA (for consistency checking).
  65  *
  66  * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
  67  */
  68 static int cuda_max_device_count = 32;
  69
  70 /** Dummy kernel used for sanity checking. */
  71 __global__ void k_dummy_test()
  72 {
  73 }
  74
  75
  76 /** Bit-flags which refer to memtestG80 test types and are used in do_memtest
  77  * to specify which tests to run. */
  78 enum memtest_G80_test_types {
  79     MOVING_INVERSIONS_10   = 0x1,
  80     MOVING_INVERSIONS_RAND = 0x2,
  81     WALKING_8BIT_M86       = 0x4,
  82     WALKING_0_8BIT         = 0x8,
  83     WALKING_1_8BIT         = 0x10,
  84     WALKING_0_32BIT        = 0x20,
  85     WALKING_1_32BIT        = 0x40,
  86     RANDOM_BLOCKS          = 0x80,
  87     MOD_20_32BIT           = 0x100,
  88     LOGIC_1_ITER           = 0x200,
  89     LOGIC_4_ITER           = 0x400,
  90     LOGIC_1_ITER_SHMEM     = 0x800,
  91     LOGIC_4_ITER_SHMEM     = 0x1000
  92 };
  93
  94
  95 /*!
  96  * \brief Runs GPU sanity checks.
  97  *
  98  * Runs a series of checks to determine that the given GPU and underlying CUDA
  99  * driver/runtime functions properly.
 100  * Returns properties of a device with given ID or the one that has
 101  * already been initialized earlier in the case if of \dev_id == -1.
 102  *
 103  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
 104  * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
 105  * \returns                0 if the device looks OK
 106  *
 107  * TODO: introduce errors codes and handle errors more smoothly.
 108  */
 109 static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
 110 {
 111     cudaError_t cu_err;
 112     int         dev_count, id;
 113
 114     cu_err = cudaGetDeviceCount(&dev_count);
 115     if (cu_err != cudaSuccess)
 116     {
 117         fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
 118                 cudaGetErrorString(cu_err));
 119         return -1;
 120     }
 121
 122     /* no CUDA compatible device at all */
 123     if (dev_count == 0)
 124     {
 125         return -1;
 126     }
 127
 128     /* things might go horribly wrong if cudart is not compatible with the driver */
 129     if (dev_count < 0 || dev_count > cuda_max_device_count)
 130     {
 131         return -1;
 132     }
 133
 134     if (dev_id == -1) /* device already selected let's not destroy the context */
 135     {
 136         cu_err = cudaGetDevice(&id);
 137         if (cu_err != cudaSuccess)
 138         {
 139             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
 140                     cudaGetErrorString(cu_err));
 141             return -1;
 142         }
 143     }
 144     else
 145     {
 146         id = dev_id;
 147         if (id > dev_count - 1) /* pfff there's no such device */
 148         {
 149             fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
 150                     dev_id, dev_count);
 151             return -1;
 152         }
 153     }
 154
 155     memset(dev_prop, 0, sizeof(cudaDeviceProp));
 156     cu_err = cudaGetDeviceProperties(dev_prop, id);
 157     if (cu_err != cudaSuccess)
 158     {
 159         fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
 160                 cudaGetErrorString(cu_err));
 161         return -1;
 162     }
 163
 164     /* both major & minor is 9999 if no CUDA capable devices are present */
 165     if (dev_prop->major == 9999 && dev_prop->minor == 9999)
 166     {
 167         return -1;
 168     }
 169     /* we don't care about emulation mode */
 170     if (dev_prop->major == 0)
 171     {
 172         return -1;
 173     }
 174
 175     if (id != -1)
 176     {
 177         cu_err = cudaSetDevice(id);
 178         if (cu_err != cudaSuccess)
 179         {
 180             fprintf(stderr, "Error %d while switching to device #%d: %s\n",
 181                     cu_err, id, cudaGetErrorString(cu_err));
 182             return -1;
 183         }
 184     }
 185
 186     /* try to execute a dummy kernel */
 187     k_dummy_test<<< 1, 512>>> ();
 188     if (cudaThreadSynchronize() != cudaSuccess)
 189     {
 190         return -1;
 191     }
 192
 193     /* destroy context if we created one */
 194     if (id != -1)
 195     {
 196 #if CUDA_VERSION < 4000
 197         cu_err = cudaThreadExit();
 198         CU_RET_ERR(cu_err, "cudaThreadExit failed");
 199 #else
 200         cu_err = cudaDeviceReset();
 201         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 202 #endif
 203     }
 204
 205     return 0;
 206 }
 207
 208
 209 /*!
 210  * \brief Runs a set of memory tests specified by the given bit-flags.
 211  * Tries to allocate and do the test on \p megs Mb memory or
 212  * the greatest amount that can be allocated (>10Mb).
 213  * In case if an error is detected it stops without finishing the remaining
 214  * steps/iterations and returns greater then zero value.
 215  * In case of other errors (e.g. kernel launch errors, device querying errors)
 216  * -1 is returned.
 217  *
 218  * \param[in] which_tests   variable with bit-flags of the requested tests
 219  * \param[in] megs          amount of memory that will be tested in MB
 220  * \param[in] iter          number of iterations
 221  * \returns                 0 if no error was detected, otherwise >0
 222  */
 223 static int do_memtest(unsigned int which_tests, int megs, int iter)
 224 {
 225     memtestState    tester;
 226     int             i;
 227     uint            err_count; //, err_iter;
 228
 229     // no parameter check as this fn won't be called externally
 230
 231     // let's try to allocate the mem
 232     while (!tester.allocate(megs) && (megs - 10 > 0))
 233     {
 234         megs -= 10; tester.deallocate();
 235     }
 236
 237     if (megs <= 10)
 238     {
 239         fprintf(stderr, "Unable to allocate GPU memory!\n");
 240         return -1;
 241     }
 242
 243     // clear the first 18 bits
 244     which_tests &= 0x3FFF;
 245     for (i = 0; i < iter; i++)
 246     {
 247         // Moving Inversions (ones and zeros)
 248         if ((MOVING_INVERSIONS_10 & which_tests) == MOVING_INVERSIONS_10)
 249         {
 250             tester.gpuMovingInversionsOnesZeros(err_count);
 251             if (err_count > 0)
 252             {
 253                 return MOVING_INVERSIONS_10;
 254             }
 255         }
 256         // Moving Inversions (random)
 257         if ((MOVING_INVERSIONS_RAND & which_tests) == MOVING_INVERSIONS_RAND)
 258         {
 259             tester.gpuMovingInversionsRandom(err_count);
 260             if (err_count > 0)
 261             {
 262                 return MOVING_INVERSIONS_RAND;
 263             }
 264         }
 265         // Memtest86 Walking 8-bit
 266         if ((WALKING_8BIT_M86 & which_tests) == WALKING_8BIT_M86)
 267         {
 268             for (uint shift = 0; shift < 8; shift++)
 269             {
 270                 tester.gpuWalking8BitM86(err_count, shift);
 271                 if (err_count > 0)
 272                 {
 273                     return WALKING_8BIT_M86;
 274                 }
 275             }
 276         }
 277         // True Walking zeros (8-bit)
 278         if ((WALKING_0_8BIT & which_tests) == WALKING_0_8BIT)
 279         {
 280             for (uint shift = 0; shift < 8; shift++)
 281             {
 282                 tester.gpuWalking8Bit(err_count, false, shift);
 283                 if (err_count > 0)
 284                 {
 285                     return WALKING_0_8BIT;
 286                 }
 287             }
 288         }
 289         // True Walking ones (8-bit)
 290         if ((WALKING_1_8BIT & which_tests) == WALKING_1_8BIT)
 291         {
 292             for (uint shift = 0; shift < 8; shift++)
 293             {
 294                 tester.gpuWalking8Bit(err_count, true, shift);
 295                 if (err_count > 0)
 296                 {
 297                     return WALKING_1_8BIT;
 298                 }
 299             }
 300         }
 301         // Memtest86 Walking zeros (32-bit)
 302         if ((WALKING_0_32BIT & which_tests) == WALKING_0_32BIT)
 303         {
 304             for (uint shift = 0; shift < 32; shift++)
 305             {
 306                 tester.gpuWalking32Bit(err_count, false, shift);
 307                 if (err_count > 0)
 308                 {
 309                     return WALKING_0_32BIT;
 310                 }
 311             }
 312         }
 313         // Memtest86 Walking ones (32-bit)
 314         if ((WALKING_1_32BIT & which_tests) == WALKING_1_32BIT)
 315         {
 316             for (uint shift = 0; shift < 32; shift++)
 317             {
 318                 tester.gpuWalking32Bit(err_count, true, shift);
 319                 if (err_count > 0)
 320                 {
 321                     return WALKING_1_32BIT;
 322                 }
 323             }
 324         }
 325         // Random blocks
 326         if ((RANDOM_BLOCKS & which_tests) == RANDOM_BLOCKS)
 327         {
 328             tester.gpuRandomBlocks(err_count, rand());
 329             if (err_count > 0)
 330             {
 331                 return RANDOM_BLOCKS;
 332             }
 333
 334         }
 335
 336         // Memtest86 Modulo-20
 337         if ((MOD_20_32BIT & which_tests) == MOD_20_32BIT)
 338         {
 339             for (uint shift = 0; shift < 20; shift++)
 340             {
 341                 tester.gpuModuloX(err_count, shift, rand(), 20, 2);
 342                 if (err_count > 0)
 343                 {
 344                     return MOD_20_32BIT;
 345                 }
 346             }
 347         }
 348         // Logic (one iteration)
 349         if ((LOGIC_1_ITER & which_tests) == LOGIC_1_ITER)
 350         {
 351             tester.gpuShortLCG0(err_count, 1);
 352             if (err_count > 0)
 353             {
 354                 return LOGIC_1_ITER;
 355             }
 356         }
 357         // Logic (4 iterations)
 358         if ((LOGIC_4_ITER & which_tests) == LOGIC_4_ITER)
 359         {
 360             tester.gpuShortLCG0(err_count, 4);
 361             if (err_count > 0)
 362             {
 363                 return LOGIC_4_ITER;
 364             }
 365
 366         }
 367         // Logic (shared memory, one iteration)
 368         if ((LOGIC_1_ITER_SHMEM & which_tests) == LOGIC_1_ITER_SHMEM)
 369         {
 370             tester.gpuShortLCG0Shmem(err_count, 1);
 371             if (err_count > 0)
 372             {
 373                 return LOGIC_1_ITER_SHMEM;
 374             }
 375         }
 376         // Logic (shared-memory, 4 iterations)
 377         if ((LOGIC_4_ITER_SHMEM & which_tests) == LOGIC_4_ITER_SHMEM)
 378         {
 379             tester.gpuShortLCG0Shmem(err_count, 4);
 380             if (err_count > 0)
 381             {
 382                 return LOGIC_4_ITER_SHMEM;
 383             }
 384         }
 385     }
 386
 387     tester.deallocate();
 388     return err_count;
 389 }
 390
 391 /*! \brief Runs a quick memory test and returns 0 in case if no error is detected.
 392  * If an error is detected it stops before completing the test and returns a
 393  * value greater then 0. In case of other errors (e.g. kernel launch errors,
 394  * device querying errors) -1 is returned.
 395  *
 396  * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
 397  * \returns             0 if no error was detected, otherwise >0
 398  */
 399 int do_quick_memtest(int dev_id)
 400 {
 401     cudaDeviceProp  dev_prop;
 402     int             devmem, res, time = 0;
 403
 404     if (debug)
 405     {
 406         time = getTimeMilliseconds();
 407     }
 408
 409     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 410     {
 411         // something went wrong
 412         return -1;
 413     }
 414
 415     if (debug)
 416     {
 417         devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 418         fprintf(debug, ">> Running QUICK memtests on %d MiB (out of total %d MiB), %d iterations\n",
 419                 QUICK_MEM, devmem, QUICK_ITER);
 420     }
 421
 422     res = do_memtest(QUICK_TESTS, QUICK_MEM, QUICK_ITER);
 423
 424     if (debug)
 425     {
 426         fprintf(debug, "Q-RES = %d\n", res);
 427         fprintf(debug, "Q-runtime: %d ms\n", getTimeMilliseconds() - time);
 428     }
 429
 430     /* destroy context only if we created it */
 431     if (dev_id != -1)
 432     {
 433         cudaThreadExit();
 434     }
 435     return res;
 436 }
 437
 438 /*! \brief Runs a full memory test and returns 0 in case if no error is detected.
 439  * If an error is detected  it stops before completing the test and returns a
 440  * value greater then 0. In case of other errors (e.g. kernel launch errors,
 441  * device querying errors) -1 is returned.
 442  *
 443  * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
 444  * \returns             0 if no error was detected, otherwise >0
 445  */
 446
 447 int do_full_memtest(int dev_id)
 448 {
 449     cudaDeviceProp  dev_prop;
 450     int             devmem, res, time = 0;
 451
 452     if (debug)
 453     {
 454         time = getTimeMilliseconds();
 455     }
 456
 457     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 458     {
 459         // something went wrong
 460         return -1;
 461     }
 462
 463     devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 464
 465     if (debug)
 466     {
 467         fprintf(debug, ">> Running FULL memtests on %d MiB (out of total %d MiB), %d iterations\n",
 468                 devmem, devmem, FULL_ITER);
 469     }
 470
 471     /* do all test on the entire memory */
 472     res = do_memtest(FULL_TESTS, devmem, FULL_ITER);
 473
 474     if (debug)
 475     {
 476         fprintf(debug, "F-RES = %d\n", res);
 477         fprintf(debug, "F-runtime: %d ms\n", getTimeMilliseconds() - time);
 478     }
 479
 480     /* destroy context only if we created it */
 481     if (dev_id != -1)
 482     {
 483         cudaThreadExit();
 484     }
 485     return res;
 486 }
 487
 488 /*! \brief Runs a time constrained memory test and returns 0 in case if no error is detected.
 489  * If an error is detected it stops before completing the test and returns a value greater
 490  * than zero. In case of other errors (e.g. kernel launch errors, device querying errors) -1
 491  * is returned. Note, that test iterations are not interrupted therefor the total runtime of
 492  * the test will always be multipple of one iteration's runtime.
 493  *
 494  * \param[in] dev_id        the device id of the GPU or -1 if the device has laredy been selected
 495  * \param[in] time_constr   the time limit of the testing
 496  * \returns                 0 if no error was detected, otherwise >0
 497  */
 498 int do_timed_memtest(int dev_id, int time_constr)
 499 {
 500     cudaDeviceProp  dev_prop;
 501     int             devmem, res = 0, time = 0, startt;
 502
 503     if (debug)
 504     {
 505         time = getTimeMilliseconds();
 506     }
 507
 508     time_constr *= 1000;  /* convert to ms for convenience */
 509     startt       = getTimeMilliseconds();
 510
 511     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 512     {
 513         // something went wrong
 514         return -1;
 515     }
 516
 517     devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 518
 519     if (debug)
 520     {
 521         fprintf(debug, ">> Running time constrained memtests on %d MiB (out of total %d MiB), time limit of %d s \n",
 522                 devmem, devmem, time_constr);
 523     }
 524
 525     /* do the TIMED_TESTS set, one step at a time on the entire memory
 526        that can be allocated, and stop when the given time is exceeded */
 527     while ( ((int)getTimeMilliseconds() - startt) < time_constr)
 528     {
 529         res = do_memtest(TIMED_TESTS, devmem, 1);
 530         if (res != 0)
 531         {
 532             break;
 533         }
 534     }
 535
 536     if (debug)
 537     {
 538         fprintf(debug, "T-RES = %d\n", res);
 539         fprintf(debug, "T-runtime: %d ms\n", getTimeMilliseconds() - time);
 540     }
 541
 542     /* destroy context only if we created it */
 543     if (dev_id != -1)
 544     {
 545         cudaThreadExit();
 546     }
 547     return res;
 548 }
 549
 550 /*! \brief Initializes the GPU with the given index.
 551  *
 552  * The varible \mygpu is the index of the GPU to initialize in the
 553  * gpu_info.cuda_dev array.
 554  *
 555  * \param[in]  mygpu        index of the GPU to initialize
 556  * \param[out] result_str   the message related to the error that occurred
 557  *                          during the initialization (if there was any).
 558  * \param[in] gpu_info      GPU info of all detected devices in the system.
 559  * \param[in] gpu_opt       options for using the GPUs in gpu_info
 560  * \returns                 true if no error occurs during initialization.
 561  */
 562 gmx_bool init_gpu(int mygpu, char *result_str,
 563                   const gmx_gpu_info_t *gpu_info,
 564                   const gmx_gpu_opt_t *gpu_opt)
 565 {
 566     cudaError_t stat;
 567     char        sbuf[STRLEN];
 568     int         gpuid;
 569
 570     assert(gpu_info);
 571     assert(result_str);
 572
 573     if (mygpu < 0 || mygpu >= gpu_opt->ncuda_dev_use)
 574     {
 575         sprintf(sbuf, "Trying to initialize an inexistent GPU: "
 576                 "there are %d %s-selected GPU(s), but #%d was requested.",
 577                 gpu_opt->ncuda_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
 578         gmx_incons(sbuf);
 579     }
 580
 581     gpuid = gpu_info->cuda_dev[gpu_opt->cuda_dev_use[mygpu]].id;
 582
 583     stat = cudaSetDevice(gpuid);
 584     strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 585
 586     if (debug)
 587     {
 588         fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->cuda_dev[gpuid].prop.name);
 589     }
 590
 591     return (stat == cudaSuccess);
 592 }
 593
 594 /*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
 595  *
 596  * The context is explicitly destroyed and therefore all data uploaded to the GPU
 597  * is lost. This should only be called when none of this data is required anymore.
 598  *
 599  * \param[out] result_str   the message related to the error that occurred
 600  *                          during the initialization (if there was any).
 601  * \returns                 true if no error occurs during the freeing.
 602  */
 603 gmx_bool free_gpu(char *result_str)
 604 {
 605     cudaError_t stat;
 606
 607     assert(result_str);
 608
 609     if (debug)
 610     {
 611         int gpuid;
 612         stat = cudaGetDevice(&gpuid);
 613         CU_RET_ERR(stat, "cudaGetDevice failed");
 614         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 615     }
 616
 617 #if CUDA_VERSION < 4000
 618     stat = cudaThreadExit();
 619 #else
 620     stat = cudaDeviceReset();
 621 #endif
 622     strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 623
 624     return (stat == cudaSuccess);
 625 }
 626
 627 /*! \brief Returns true if the gpu characterized by the device properties is
 628  *  supported by the native gpu acceleration.
 629  *
 630  * \param[in] dev_prop  the CUDA device properties of the gpus to test.
 631  * \returns             true if the GPU properties passed indicate a compatible
 632  *                      GPU, otherwise false.
 633  */
 634 static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
 635 {
 636     return (dev_prop->major >= 2);
 637 }
 638
 639 /*! \brief Helper function that checks whether a given GPU status indicates compatible GPU.
 640  *
 641  * \param[in] stat  GPU status.
 642  * \returns         true if the provided status is egpuCompatible, otherwise false.
 643  */
 644 static bool is_compatible_gpu(int stat)
 645 {
 646     return (stat == egpuCompatible);
 647 }
 648
 649 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 650  *
 651  *  Returns a status value which indicates compatibility or one of the following
 652  *  errors: incompatibility, insistence, or insanity (=unexpected behavior).
 653  *  It also returns the respective device's properties in \dev_prop (if applicable).
 654  *
 655  *  \param[in]  dev_id   the ID of the GPU to check.
 656  *  \param[out] dev_prop the CUDA device properties of the device checked.
 657  *  \returns             the status of the requested device
 658  */
 659 static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
 660 {
 661     cudaError_t stat;
 662     int         ndev;
 663
 664     stat = cudaGetDeviceCount(&ndev);
 665     if (stat != cudaSuccess)
 666     {
 667         return egpuInsane;
 668     }
 669
 670     if (dev_id > ndev - 1)
 671     {
 672         return egpuNonexistent;
 673     }
 674
 675     /* TODO: currently we do not make a distinction between the type of errors
 676      * that can appear during sanity checks. This needs to be improved, e.g if
 677      * the dummy test kernel fails to execute with a "device busy message" we
 678      * should appropriately report that the device is busy instead of insane.
 679      */
 680     if (do_sanity_checks(dev_id, dev_prop) == 0)
 681     {
 682         if (is_gmx_supported_gpu(dev_prop))
 683         {
 684             return egpuCompatible;
 685         }
 686         else
 687         {
 688             return egpuIncompatible;
 689         }
 690     }
 691     else
 692     {
 693         return egpuInsane;
 694     }
 695 }
 696
 697
 698 /*! \brief Detect all NVIDIA GPUs in the system.
 699  *
 700  *  Will detect every NVIDIA GPU supported by the device driver in use. Also
 701  *  check for the compatibility of each and fill the gpu_info->cuda_dev array
 702  *  with the required information on each the device: ID, device properties,
 703  *  status.
 704  *
 705  *  \param[in] gpu_info    pointer to structure holding GPU information.
 706  *  \param[out] err_str    The error message of any CUDA API error that caused
 707  *                         the detection to fail (if there was any). The memory
 708  *                         the pointer points to should be managed externally.
 709  *  \returns               non-zero if the detection encountered a failure, zero otherwise.
 710  */
 711 int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
 712 {
 713     int              i, ndev, checkres, retval;
 714     cudaError_t      stat;
 715     cudaDeviceProp   prop;
 716     cuda_dev_info_t *devs;
 717
 718     assert(gpu_info);
 719     assert(err_str);
 720
 721     gpu_info->ncuda_dev_compatible = 0;
 722
 723     ndev    = 0;
 724     devs    = NULL;
 725
 726     stat = cudaGetDeviceCount(&ndev);
 727     if (stat != cudaSuccess)
 728     {
 729         const char *s;
 730
 731         /* cudaGetDeviceCount failed which means that there is something
 732          * wrong with the machine: driver-runtime mismatch, all GPUs being
 733          * busy in exclusive mode, or some other condition which should
 734          * result in us issuing a warning a falling back to CPUs. */
 735         retval = -1;
 736         s      = cudaGetErrorString(stat);
 737         strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
 738     }
 739     else
 740     {
 741         snew(devs, ndev);
 742         for (i = 0; i < ndev; i++)
 743         {
 744             checkres = is_gmx_supported_gpu_id(i, &prop);
 745
 746             devs[i].id   = i;
 747             devs[i].prop = prop;
 748             devs[i].stat = checkres;
 749
 750             if (checkres == egpuCompatible)
 751             {
 752                 gpu_info->ncuda_dev_compatible++;
 753             }
 754         }
 755         retval = 0;
 756     }
 757
 758     gpu_info->ncuda_dev = ndev;
 759     gpu_info->cuda_dev  = devs;
 760
 761     return retval;
 762 }
 763
 764 /*! \brief Select the GPUs compatible with the native GROMACS acceleration.
 765  *
 766  * This function selects the compatible gpus and initializes
 767  * gpu_info->cuda_dev_use and gpu_info->ncuda_dev_use.
 768  *
 769  * Given the list of GPUs available in the system check each device in
 770  * gpu_info->cuda_dev and place the indices of the compatible GPUs into
 771  * cuda_dev_use with this marking the respective GPUs as "available for use."
 772  * Note that \detect_cuda_gpus must have been called before.
 773  *
 774  * \param[in]     gpu_info    pointer to structure holding GPU information
 775  * \param[in,out] gpu_opt     pointer to structure holding GPU options
 776  */
 777 void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info,
 778                           gmx_gpu_opt_t        *gpu_opt)
 779 {
 780     int  i, ncompat;
 781     int *compat;
 782
 783     assert(gpu_info);
 784     /* cuda_dev/ncuda_dev have to be either NULL/0 or not (NULL/0) */
 785     assert((gpu_info->ncuda_dev != 0 ? 0 : 1) ^ (gpu_info->cuda_dev == NULL ? 0 : 1));
 786
 787     snew(compat, gpu_info->ncuda_dev);
 788     ncompat = 0;
 789     for (i = 0; i < gpu_info->ncuda_dev; i++)
 790     {
 791         if (is_compatible_gpu(gpu_info->cuda_dev[i].stat))
 792         {
 793             ncompat++;
 794             compat[ncompat - 1] = i;
 795         }
 796     }
 797
 798     gpu_opt->ncuda_dev_use = ncompat;
 799     snew(gpu_opt->cuda_dev_use, ncompat);
 800     memcpy(gpu_opt->cuda_dev_use, compat, ncompat*sizeof(*compat));
 801     sfree(compat);
 802 }
 803
 804 /*! \brief Check the existence/compatibility of a set of GPUs specified by their device IDs.
 805  *
 806  * Given the a list of gpu->ncuda_dev_use GPU device IDs stored in
 807  * gpu_opt->cuda_dev_use check the existence and compatibility
 808  * of the respective GPUs. Also provide the caller with an array containing
 809  * the result of checks in \checkres.
 810  *
 811  * \param[out]  checkres    check result for each ID passed in \requested_devs
 812  * \param[in]   gpu_info    pointer to structure holding GPU information
 813  * \param[out]  gpu_opt     pointer to structure holding GPU options
 814  * \returns                 TRUE if every the requested GPUs are compatible
 815  */
 816 gmx_bool check_selected_cuda_gpus(int                  *checkres,
 817                                   const gmx_gpu_info_t *gpu_info,
 818                                   gmx_gpu_opt_t        *gpu_opt)
 819 {
 820     int  i, id;
 821     bool bAllOk;
 822
 823     assert(checkres);
 824     assert(gpu_info);
 825     assert(gpu_opt->ncuda_dev_use >= 0);
 826
 827     if (gpu_opt->ncuda_dev_use == 0)
 828     {
 829         return TRUE;
 830     }
 831
 832     assert(gpu_opt->cuda_dev_use);
 833
 834     /* we will assume that all GPUs requested are valid IDs,
 835        otherwise we'll bail anyways */
 836
 837     bAllOk = true;
 838     for (i = 0; i < gpu_opt->ncuda_dev_use; i++)
 839     {
 840         id = gpu_opt->cuda_dev_use[i];
 841
 842         /* devices are stored in increasing order of IDs in cuda_dev */
 843         gpu_opt->cuda_dev_use[i] = id;
 844
 845         checkres[i] = (id >= gpu_info->ncuda_dev) ?
 846             egpuNonexistent : gpu_info->cuda_dev[id].stat;
 847
 848         bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
 849     }
 850
 851     return bAllOk;
 852 }
 853
 854 /*! \brief Frees the cuda_dev and cuda_dev_use array fields of \gpu_info.
 855  *
 856  * \param[in]    gpu_info    pointer to structure holding GPU information
 857  */
 858 void free_gpu_info(const gmx_gpu_info_t *gpu_info)
 859 {
 860     if (gpu_info == NULL)
 861     {
 862         return;
 863     }
 864
 865     sfree(gpu_info->cuda_dev);
 866 }
 867
 868 /*! \brief Formats and returns a device information string for a given GPU.
 869  *
 870  * Given an index *directly* into the array of available GPUs (cuda_dev)
 871  * returns a formatted info string for the respective GPU which includes
 872  * ID, name, compute capability, and detection status.
 873  *
 874  * \param[out]  s           pointer to output string (has to be allocated externally)
 875  * \param[in]   gpu_info    pointer to structure holding GPU information
 876  * \param[in]   index       an index *directly* into the array of available GPUs
 877  */
 878 void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index)
 879 {
 880     assert(s);
 881     assert(gpu_info);
 882
 883     if (index < 0 && index >= gpu_info->ncuda_dev)
 884     {
 885         return;
 886     }
 887
 888     cuda_dev_info_t *dinfo = &gpu_info->cuda_dev[index];
 889
 890     bool             bGpuExists =
 891         dinfo->stat == egpuCompatible ||
 892         dinfo->stat == egpuIncompatible;
 893
 894     if (!bGpuExists)
 895     {
 896         sprintf(s, "#%d: %s, stat: %s",
 897                 dinfo->id, "N/A",
 898                 gpu_detect_res_str[dinfo->stat]);
 899     }
 900     else
 901     {
 902         sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
 903                 dinfo->id, dinfo->prop.name,
 904                 dinfo->prop.major, dinfo->prop.minor,
 905                 dinfo->prop.ECCEnabled ? "yes" : " no",
 906                 gpu_detect_res_str[dinfo->stat]);
 907     }
 908 }
 909
 910 /*! \brief Returns the device ID of the GPU with a given index into the array of used GPUs.
 911  *
 912  * Getter function which, given an index into the array of GPUs in use
 913  * (cuda_dev_use) -- typically a tMPI/MPI rank --, returns the device ID of the
 914  * respective CUDA GPU.
 915  *
 916  * \param[in]    gpu_info   pointer to structure holding GPU information
 917  * \param[in]    gpu_opt    pointer to structure holding GPU options
 918  * \param[in]    idx        index into the array of used GPUs
 919  * \returns                 device ID of the requested GPU
 920  */
 921 int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
 922                       const gmx_gpu_opt_t  *gpu_opt,
 923                       int                   idx)
 924 {
 925     assert(gpu_info);
 926     assert(gpu_opt);
 927     assert(idx >= 0 && idx < gpu_opt->ncuda_dev_use);
 928
 929     return gpu_info->cuda_dev[gpu_opt->cuda_dev_use[idx]].id;
 930 }
 931
 932 /*! \brief Returns the device ID of the GPU currently in use.
 933  *
 934  * The GPU used is the one that is active at the time of the call in the active context.
 935  *
 936  * \param[in]    gpu_info   pointer to structure holding GPU information
 937  * \returns                 device ID of the GPU in use at the time of the call
 938  */
 939 int get_current_gpu_device_id(void)
 940 {
 941     int gpuid;
 942     CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 943
 944     return gpuid;
 945 }
 946
 947 /*! \brief Returns the size of the cuda_dev_info struct.
 948  *
 949  * The size of cuda_dev_info can be used for allocation and communication.
 950  *
 951  * \returns                 size in bytes of cuda_dev_info
 952  */
 953 size_t sizeof_cuda_dev_info(void)
 954 {
 955     return sizeof(cuda_dev_info);
 956 }