src/gromacs/gmxlib/gpu_utils/gpu_utils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2010,2011,2012,2013,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include <stdio.h>
  37 #include <stdlib.h>
  38 #include <assert.h>
  39
  40 #include "types/hw_info.h"
  41
  42 #include "gpu_utils.h"
  43 #include "../cuda_tools/cudautils.cuh"
  44 #include "memtestG80_core.h"
  45
  46 #include "gromacs/utility/cstringutil.h"
  47 #include "gromacs/utility/smalloc.h"
  48
  49 /** Amount of memory to be used in quick memtest. */
  50 #define QUICK_MEM       250
  51 /** Bit flag with type of tests to run in quick memtest. */
  52 #define QUICK_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS
  53 /** Number of iterations in quick memtest. */
  54 #define QUICK_ITER      3
  55
  56 /** Bitflag with all test set on for full memetest. */
  57 #define FULL_TESTS      0x3FFF
  58 /** Number of iterations in full memtest. */
  59 #define FULL_ITER       25
  60
  61 /** Bit flag with type of tests to run in time constrained memtest. */
  62 #define TIMED_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS
  63
  64 /*! \brief
  65  * Max number of devices supported by CUDA (for consistency checking).
  66  *
  67  * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
  68  */
  69 static int cuda_max_device_count = 32;
  70
  71 /** Dummy kernel used for sanity checking. */
  72 __global__ void k_dummy_test()
  73 {
  74 }
  75
  76
  77 /** Bit-flags which refer to memtestG80 test types and are used in do_memtest
  78  * to specify which tests to run. */
  79 enum memtest_G80_test_types {
  80     MOVING_INVERSIONS_10   = 0x1,
  81     MOVING_INVERSIONS_RAND = 0x2,
  82     WALKING_8BIT_M86       = 0x4,
  83     WALKING_0_8BIT         = 0x8,
  84     WALKING_1_8BIT         = 0x10,
  85     WALKING_0_32BIT        = 0x20,
  86     WALKING_1_32BIT        = 0x40,
  87     RANDOM_BLOCKS          = 0x80,
  88     MOD_20_32BIT           = 0x100,
  89     LOGIC_1_ITER           = 0x200,
  90     LOGIC_4_ITER           = 0x400,
  91     LOGIC_1_ITER_SHMEM     = 0x800,
  92     LOGIC_4_ITER_SHMEM     = 0x1000
  93 };
  94
  95
  96 /*!
  97  * \brief Runs GPU sanity checks.
  98  *
  99  * Runs a series of checks to determine that the given GPU and underlying CUDA
 100  * driver/runtime functions properly.
 101  * Returns properties of a device with given ID or the one that has
 102  * already been initialized earlier in the case if of \dev_id == -1.
 103  *
 104  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
 105  * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
 106  * \returns                0 if the device looks OK
 107  *
 108  * TODO: introduce errors codes and handle errors more smoothly.
 109  */
 110 static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
 111 {
 112     cudaError_t cu_err;
 113     int         dev_count, id;
 114
 115     cu_err = cudaGetDeviceCount(&dev_count);
 116     if (cu_err != cudaSuccess)
 117     {
 118         fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
 119                 cudaGetErrorString(cu_err));
 120         return -1;
 121     }
 122
 123     /* no CUDA compatible device at all */
 124     if (dev_count == 0)
 125     {
 126         return -1;
 127     }
 128
 129     /* things might go horribly wrong if cudart is not compatible with the driver */
 130     if (dev_count < 0 || dev_count > cuda_max_device_count)
 131     {
 132         return -1;
 133     }
 134
 135     if (dev_id == -1) /* device already selected let's not destroy the context */
 136     {
 137         cu_err = cudaGetDevice(&id);
 138         if (cu_err != cudaSuccess)
 139         {
 140             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
 141                     cudaGetErrorString(cu_err));
 142             return -1;
 143         }
 144     }
 145     else
 146     {
 147         id = dev_id;
 148         if (id > dev_count - 1) /* pfff there's no such device */
 149         {
 150             fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
 151                     dev_id, dev_count);
 152             return -1;
 153         }
 154     }
 155
 156     memset(dev_prop, 0, sizeof(cudaDeviceProp));
 157     cu_err = cudaGetDeviceProperties(dev_prop, id);
 158     if (cu_err != cudaSuccess)
 159     {
 160         fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
 161                 cudaGetErrorString(cu_err));
 162         return -1;
 163     }
 164
 165     /* both major & minor is 9999 if no CUDA capable devices are present */
 166     if (dev_prop->major == 9999 && dev_prop->minor == 9999)
 167     {
 168         return -1;
 169     }
 170     /* we don't care about emulation mode */
 171     if (dev_prop->major == 0)
 172     {
 173         return -1;
 174     }
 175
 176     if (id != -1)
 177     {
 178         cu_err = cudaSetDevice(id);
 179         if (cu_err != cudaSuccess)
 180         {
 181             fprintf(stderr, "Error %d while switching to device #%d: %s\n",
 182                     cu_err, id, cudaGetErrorString(cu_err));
 183             return -1;
 184         }
 185     }
 186
 187     /* try to execute a dummy kernel */
 188     k_dummy_test<<< 1, 512>>> ();
 189     if (cudaThreadSynchronize() != cudaSuccess)
 190     {
 191         return -1;
 192     }
 193
 194     /* destroy context if we created one */
 195     if (id != -1)
 196     {
 197 #if CUDA_VERSION < 4000
 198         cu_err = cudaThreadExit();
 199         CU_RET_ERR(cu_err, "cudaThreadExit failed");
 200 #else
 201         cu_err = cudaDeviceReset();
 202         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 203 #endif
 204     }
 205
 206     return 0;
 207 }
 208
 209
 210 /*!
 211  * \brief Runs a set of memory tests specified by the given bit-flags.
 212  * Tries to allocate and do the test on \p megs Mb memory or
 213  * the greatest amount that can be allocated (>10Mb).
 214  * In case if an error is detected it stops without finishing the remaining
 215  * steps/iterations and returns greater then zero value.
 216  * In case of other errors (e.g. kernel launch errors, device querying errors)
 217  * -1 is returned.
 218  *
 219  * \param[in] which_tests   variable with bit-flags of the requested tests
 220  * \param[in] megs          amount of memory that will be tested in MB
 221  * \param[in] iter          number of iterations
 222  * \returns                 0 if no error was detected, otherwise >0
 223  */
 224 static int do_memtest(unsigned int which_tests, int megs, int iter)
 225 {
 226     memtestState    tester;
 227     int             i;
 228     uint            err_count; //, err_iter;
 229
 230     // no parameter check as this fn won't be called externally
 231
 232     // let's try to allocate the mem
 233     while (!tester.allocate(megs) && (megs - 10 > 0))
 234     {
 235         megs -= 10; tester.deallocate();
 236     }
 237
 238     if (megs <= 10)
 239     {
 240         fprintf(stderr, "Unable to allocate GPU memory!\n");
 241         return -1;
 242     }
 243
 244     // clear the first 18 bits
 245     which_tests &= 0x3FFF;
 246     for (i = 0; i < iter; i++)
 247     {
 248         // Moving Inversions (ones and zeros)
 249         if ((MOVING_INVERSIONS_10 & which_tests) == MOVING_INVERSIONS_10)
 250         {
 251             tester.gpuMovingInversionsOnesZeros(err_count);
 252             if (err_count > 0)
 253             {
 254                 return MOVING_INVERSIONS_10;
 255             }
 256         }
 257         // Moving Inversions (random)
 258         if ((MOVING_INVERSIONS_RAND & which_tests) == MOVING_INVERSIONS_RAND)
 259         {
 260             tester.gpuMovingInversionsRandom(err_count);
 261             if (err_count > 0)
 262             {
 263                 return MOVING_INVERSIONS_RAND;
 264             }
 265         }
 266         // Memtest86 Walking 8-bit
 267         if ((WALKING_8BIT_M86 & which_tests) == WALKING_8BIT_M86)
 268         {
 269             for (uint shift = 0; shift < 8; shift++)
 270             {
 271                 tester.gpuWalking8BitM86(err_count, shift);
 272                 if (err_count > 0)
 273                 {
 274                     return WALKING_8BIT_M86;
 275                 }
 276             }
 277         }
 278         // True Walking zeros (8-bit)
 279         if ((WALKING_0_8BIT & which_tests) == WALKING_0_8BIT)
 280         {
 281             for (uint shift = 0; shift < 8; shift++)
 282             {
 283                 tester.gpuWalking8Bit(err_count, false, shift);
 284                 if (err_count > 0)
 285                 {
 286                     return WALKING_0_8BIT;
 287                 }
 288             }
 289         }
 290         // True Walking ones (8-bit)
 291         if ((WALKING_1_8BIT & which_tests) == WALKING_1_8BIT)
 292         {
 293             for (uint shift = 0; shift < 8; shift++)
 294             {
 295                 tester.gpuWalking8Bit(err_count, true, shift);
 296                 if (err_count > 0)
 297                 {
 298                     return WALKING_1_8BIT;
 299                 }
 300             }
 301         }
 302         // Memtest86 Walking zeros (32-bit)
 303         if ((WALKING_0_32BIT & which_tests) == WALKING_0_32BIT)
 304         {
 305             for (uint shift = 0; shift < 32; shift++)
 306             {
 307                 tester.gpuWalking32Bit(err_count, false, shift);
 308                 if (err_count > 0)
 309                 {
 310                     return WALKING_0_32BIT;
 311                 }
 312             }
 313         }
 314         // Memtest86 Walking ones (32-bit)
 315         if ((WALKING_1_32BIT & which_tests) == WALKING_1_32BIT)
 316         {
 317             for (uint shift = 0; shift < 32; shift++)
 318             {
 319                 tester.gpuWalking32Bit(err_count, true, shift);
 320                 if (err_count > 0)
 321                 {
 322                     return WALKING_1_32BIT;
 323                 }
 324             }
 325         }
 326         // Random blocks
 327         if ((RANDOM_BLOCKS & which_tests) == RANDOM_BLOCKS)
 328         {
 329             tester.gpuRandomBlocks(err_count, rand());
 330             if (err_count > 0)
 331             {
 332                 return RANDOM_BLOCKS;
 333             }
 334
 335         }
 336
 337         // Memtest86 Modulo-20
 338         if ((MOD_20_32BIT & which_tests) == MOD_20_32BIT)
 339         {
 340             for (uint shift = 0; shift < 20; shift++)
 341             {
 342                 tester.gpuModuloX(err_count, shift, rand(), 20, 2);
 343                 if (err_count > 0)
 344                 {
 345                     return MOD_20_32BIT;
 346                 }
 347             }
 348         }
 349         // Logic (one iteration)
 350         if ((LOGIC_1_ITER & which_tests) == LOGIC_1_ITER)
 351         {
 352             tester.gpuShortLCG0(err_count, 1);
 353             if (err_count > 0)
 354             {
 355                 return LOGIC_1_ITER;
 356             }
 357         }
 358         // Logic (4 iterations)
 359         if ((LOGIC_4_ITER & which_tests) == LOGIC_4_ITER)
 360         {
 361             tester.gpuShortLCG0(err_count, 4);
 362             if (err_count > 0)
 363             {
 364                 return LOGIC_4_ITER;
 365             }
 366
 367         }
 368         // Logic (shared memory, one iteration)
 369         if ((LOGIC_1_ITER_SHMEM & which_tests) == LOGIC_1_ITER_SHMEM)
 370         {
 371             tester.gpuShortLCG0Shmem(err_count, 1);
 372             if (err_count > 0)
 373             {
 374                 return LOGIC_1_ITER_SHMEM;
 375             }
 376         }
 377         // Logic (shared-memory, 4 iterations)
 378         if ((LOGIC_4_ITER_SHMEM & which_tests) == LOGIC_4_ITER_SHMEM)
 379         {
 380             tester.gpuShortLCG0Shmem(err_count, 4);
 381             if (err_count > 0)
 382             {
 383                 return LOGIC_4_ITER_SHMEM;
 384             }
 385         }
 386     }
 387
 388     tester.deallocate();
 389     return err_count;
 390 }
 391
 392 /*! \brief Runs a quick memory test and returns 0 in case if no error is detected.
 393  * If an error is detected it stops before completing the test and returns a
 394  * value greater then 0. In case of other errors (e.g. kernel launch errors,
 395  * device querying errors) -1 is returned.
 396  *
 397  * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
 398  * \returns             0 if no error was detected, otherwise >0
 399  */
 400 int do_quick_memtest(int dev_id)
 401 {
 402     cudaDeviceProp  dev_prop;
 403     int             devmem, res, time = 0;
 404
 405     if (debug)
 406     {
 407         time = getTimeMilliseconds();
 408     }
 409
 410     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 411     {
 412         // something went wrong
 413         return -1;
 414     }
 415
 416     if (debug)
 417     {
 418         devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 419         fprintf(debug, ">> Running QUICK memtests on %d MiB (out of total %d MiB), %d iterations\n",
 420                 QUICK_MEM, devmem, QUICK_ITER);
 421     }
 422
 423     res = do_memtest(QUICK_TESTS, QUICK_MEM, QUICK_ITER);
 424
 425     if (debug)
 426     {
 427         fprintf(debug, "Q-RES = %d\n", res);
 428         fprintf(debug, "Q-runtime: %d ms\n", getTimeMilliseconds() - time);
 429     }
 430
 431     /* destroy context only if we created it */
 432     if (dev_id != -1)
 433     {
 434         cudaThreadExit();
 435     }
 436     return res;
 437 }
 438
 439 /*! \brief Runs a full memory test and returns 0 in case if no error is detected.
 440  * If an error is detected  it stops before completing the test and returns a
 441  * value greater then 0. In case of other errors (e.g. kernel launch errors,
 442  * device querying errors) -1 is returned.
 443  *
 444  * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
 445  * \returns             0 if no error was detected, otherwise >0
 446  */
 447
 448 int do_full_memtest(int dev_id)
 449 {
 450     cudaDeviceProp  dev_prop;
 451     int             devmem, res, time = 0;
 452
 453     if (debug)
 454     {
 455         time = getTimeMilliseconds();
 456     }
 457
 458     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 459     {
 460         // something went wrong
 461         return -1;
 462     }
 463
 464     devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 465
 466     if (debug)
 467     {
 468         fprintf(debug, ">> Running FULL memtests on %d MiB (out of total %d MiB), %d iterations\n",
 469                 devmem, devmem, FULL_ITER);
 470     }
 471
 472     /* do all test on the entire memory */
 473     res = do_memtest(FULL_TESTS, devmem, FULL_ITER);
 474
 475     if (debug)
 476     {
 477         fprintf(debug, "F-RES = %d\n", res);
 478         fprintf(debug, "F-runtime: %d ms\n", getTimeMilliseconds() - time);
 479     }
 480
 481     /* destroy context only if we created it */
 482     if (dev_id != -1)
 483     {
 484         cudaThreadExit();
 485     }
 486     return res;
 487 }
 488
 489 /*! \brief Runs a time constrained memory test and returns 0 in case if no error is detected.
 490  * If an error is detected it stops before completing the test and returns a value greater
 491  * than zero. In case of other errors (e.g. kernel launch errors, device querying errors) -1
 492  * is returned. Note, that test iterations are not interrupted therefor the total runtime of
 493  * the test will always be multipple of one iteration's runtime.
 494  *
 495  * \param[in] dev_id        the device id of the GPU or -1 if the device has laredy been selected
 496  * \param[in] time_constr   the time limit of the testing
 497  * \returns                 0 if no error was detected, otherwise >0
 498  */
 499 int do_timed_memtest(int dev_id, int time_constr)
 500 {
 501     cudaDeviceProp  dev_prop;
 502     int             devmem, res = 0, time = 0, startt;
 503
 504     if (debug)
 505     {
 506         time = getTimeMilliseconds();
 507     }
 508
 509     time_constr *= 1000;  /* convert to ms for convenience */
 510     startt       = getTimeMilliseconds();
 511
 512     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 513     {
 514         // something went wrong
 515         return -1;
 516     }
 517
 518     devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 519
 520     if (debug)
 521     {
 522         fprintf(debug, ">> Running time constrained memtests on %d MiB (out of total %d MiB), time limit of %d s \n",
 523                 devmem, devmem, time_constr);
 524     }
 525
 526     /* do the TIMED_TESTS set, one step at a time on the entire memory
 527        that can be allocated, and stop when the given time is exceeded */
 528     while ( ((int)getTimeMilliseconds() - startt) < time_constr)
 529     {
 530         res = do_memtest(TIMED_TESTS, devmem, 1);
 531         if (res != 0)
 532         {
 533             break;
 534         }
 535     }
 536
 537     if (debug)
 538     {
 539         fprintf(debug, "T-RES = %d\n", res);
 540         fprintf(debug, "T-runtime: %d ms\n", getTimeMilliseconds() - time);
 541     }
 542
 543     /* destroy context only if we created it */
 544     if (dev_id != -1)
 545     {
 546         cudaThreadExit();
 547     }
 548     return res;
 549 }
 550
 551 /*! \brief Initializes the GPU with the given index.
 552  *
 553  * The varible \mygpu is the index of the GPU to initialize in the
 554  * gpu_info.cuda_dev array.
 555  *
 556  * \param[in]  mygpu        index of the GPU to initialize
 557  * \param[out] result_str   the message related to the error that occurred
 558  *                          during the initialization (if there was any).
 559  * \param[in] gpu_info      GPU info of all detected devices in the system.
 560  * \param[in] gpu_opt       options for using the GPUs in gpu_info
 561  * \returns                 true if no error occurs during initialization.
 562  */
 563 gmx_bool init_gpu(int mygpu, char *result_str,
 564                   const gmx_gpu_info_t *gpu_info,
 565                   const gmx_gpu_opt_t *gpu_opt)
 566 {
 567     cudaError_t stat;
 568     char        sbuf[STRLEN];
 569     int         gpuid;
 570
 571     assert(gpu_info);
 572     assert(result_str);
 573
 574     if (mygpu < 0 || mygpu >= gpu_opt->ncuda_dev_use)
 575     {
 576         sprintf(sbuf, "Trying to initialize an inexistent GPU: "
 577                 "there are %d %s-selected GPU(s), but #%d was requested.",
 578                 gpu_opt->ncuda_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
 579         gmx_incons(sbuf);
 580     }
 581
 582     gpuid = gpu_info->cuda_dev[gpu_opt->cuda_dev_use[mygpu]].id;
 583
 584     stat = cudaSetDevice(gpuid);
 585     strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 586
 587     if (debug)
 588     {
 589         fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->cuda_dev[gpuid].prop.name);
 590     }
 591
 592     return (stat == cudaSuccess);
 593 }
 594
 595 /*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
 596  *
 597  * The context is explicitly destroyed and therefore all data uploaded to the GPU
 598  * is lost. This should only be called when none of this data is required anymore.
 599  *
 600  * \param[out] result_str   the message related to the error that occurred
 601  *                          during the initialization (if there was any).
 602  * \returns                 true if no error occurs during the freeing.
 603  */
 604 gmx_bool free_gpu(char *result_str)
 605 {
 606     cudaError_t stat;
 607
 608     assert(result_str);
 609
 610     if (debug)
 611     {
 612         int gpuid;
 613         stat = cudaGetDevice(&gpuid);
 614         CU_RET_ERR(stat, "cudaGetDevice failed");
 615         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 616     }
 617
 618 #if CUDA_VERSION < 4000
 619     stat = cudaThreadExit();
 620 #else
 621     stat = cudaDeviceReset();
 622 #endif
 623     strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 624
 625     return (stat == cudaSuccess);
 626 }
 627
 628 /*! \brief Returns true if the gpu characterized by the device properties is
 629  *  supported by the native gpu acceleration.
 630  *
 631  * \param[in] dev_prop  the CUDA device properties of the gpus to test.
 632  * \returns             true if the GPU properties passed indicate a compatible
 633  *                      GPU, otherwise false.
 634  */
 635 static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
 636 {
 637     return (dev_prop->major >= 2);
 638 }
 639
 640 /*! \brief Helper function that checks whether a given GPU status indicates compatible GPU.
 641  *
 642  * \param[in] stat  GPU status.
 643  * \returns         true if the provided status is egpuCompatible, otherwise false.
 644  */
 645 static bool is_compatible_gpu(int stat)
 646 {
 647     return (stat == egpuCompatible);
 648 }
 649
 650 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 651  *
 652  *  Returns a status value which indicates compatibility or one of the following
 653  *  errors: incompatibility, insistence, or insanity (=unexpected behavior).
 654  *  It also returns the respective device's properties in \dev_prop (if applicable).
 655  *
 656  *  \param[in]  dev_id   the ID of the GPU to check.
 657  *  \param[out] dev_prop the CUDA device properties of the device checked.
 658  *  \returns             the status of the requested device
 659  */
 660 static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
 661 {
 662     cudaError_t stat;
 663     int         ndev;
 664
 665     stat = cudaGetDeviceCount(&ndev);
 666     if (stat != cudaSuccess)
 667     {
 668         return egpuInsane;
 669     }
 670
 671     if (dev_id > ndev - 1)
 672     {
 673         return egpuNonexistent;
 674     }
 675
 676     /* TODO: currently we do not make a distinction between the type of errors
 677      * that can appear during sanity checks. This needs to be improved, e.g if
 678      * the dummy test kernel fails to execute with a "device busy message" we
 679      * should appropriately report that the device is busy instead of insane.
 680      */
 681     if (do_sanity_checks(dev_id, dev_prop) == 0)
 682     {
 683         if (is_gmx_supported_gpu(dev_prop))
 684         {
 685             return egpuCompatible;
 686         }
 687         else
 688         {
 689             return egpuIncompatible;
 690         }
 691     }
 692     else
 693     {
 694         return egpuInsane;
 695     }
 696 }
 697
 698
 699 /*! \brief Detect all NVIDIA GPUs in the system.
 700  *
 701  *  Will detect every NVIDIA GPU supported by the device driver in use. Also
 702  *  check for the compatibility of each and fill the gpu_info->cuda_dev array
 703  *  with the required information on each the device: ID, device properties,
 704  *  status.
 705  *
 706  *  \param[in] gpu_info    pointer to structure holding GPU information.
 707  *  \param[out] err_str    The error message of any CUDA API error that caused
 708  *                         the detection to fail (if there was any). The memory
 709  *                         the pointer points to should be managed externally.
 710  *  \returns               non-zero if the detection encountered a failure, zero otherwise.
 711  */
 712 int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
 713 {
 714     int              i, ndev, checkres, retval;
 715     cudaError_t      stat;
 716     cudaDeviceProp   prop;
 717     cuda_dev_info_t *devs;
 718
 719     assert(gpu_info);
 720     assert(err_str);
 721
 722     gpu_info->ncuda_dev_compatible = 0;
 723
 724     ndev    = 0;
 725     devs    = NULL;
 726
 727     stat = cudaGetDeviceCount(&ndev);
 728     if (stat != cudaSuccess)
 729     {
 730         const char *s;
 731
 732         /* cudaGetDeviceCount failed which means that there is something
 733          * wrong with the machine: driver-runtime mismatch, all GPUs being
 734          * busy in exclusive mode, or some other condition which should
 735          * result in us issuing a warning a falling back to CPUs. */
 736         retval = -1;
 737         s      = cudaGetErrorString(stat);
 738         strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
 739     }
 740     else
 741     {
 742         snew(devs, ndev);
 743         for (i = 0; i < ndev; i++)
 744         {
 745             checkres = is_gmx_supported_gpu_id(i, &prop);
 746
 747             devs[i].id   = i;
 748             devs[i].prop = prop;
 749             devs[i].stat = checkres;
 750
 751             if (checkres == egpuCompatible)
 752             {
 753                 gpu_info->ncuda_dev_compatible++;
 754             }
 755         }
 756         retval = 0;
 757     }
 758
 759     gpu_info->ncuda_dev = ndev;
 760     gpu_info->cuda_dev  = devs;
 761
 762     return retval;
 763 }
 764
 765 /*! \brief Select the GPUs compatible with the native GROMACS acceleration.
 766  *
 767  * This function selects the compatible gpus and initializes
 768  * gpu_info->cuda_dev_use and gpu_info->ncuda_dev_use.
 769  *
 770  * Given the list of GPUs available in the system check each device in
 771  * gpu_info->cuda_dev and place the indices of the compatible GPUs into
 772  * cuda_dev_use with this marking the respective GPUs as "available for use."
 773  * Note that \detect_cuda_gpus must have been called before.
 774  *
 775  * \param[in]     gpu_info    pointer to structure holding GPU information
 776  * \param[in,out] gpu_opt     pointer to structure holding GPU options
 777  */
 778 void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info,
 779                           gmx_gpu_opt_t        *gpu_opt)
 780 {
 781     int  i, ncompat;
 782     int *compat;
 783
 784     assert(gpu_info);
 785     /* cuda_dev/ncuda_dev have to be either NULL/0 or not (NULL/0) */
 786     assert((gpu_info->ncuda_dev != 0 ? 0 : 1) ^ (gpu_info->cuda_dev == NULL ? 0 : 1));
 787
 788     snew(compat, gpu_info->ncuda_dev);
 789     ncompat = 0;
 790     for (i = 0; i < gpu_info->ncuda_dev; i++)
 791     {
 792         if (is_compatible_gpu(gpu_info->cuda_dev[i].stat))
 793         {
 794             ncompat++;
 795             compat[ncompat - 1] = i;
 796         }
 797     }
 798
 799     gpu_opt->ncuda_dev_use = ncompat;
 800     snew(gpu_opt->cuda_dev_use, ncompat);
 801     memcpy(gpu_opt->cuda_dev_use, compat, ncompat*sizeof(*compat));
 802     sfree(compat);
 803 }
 804
 805 /*! \brief Check the existence/compatibility of a set of GPUs specified by their device IDs.
 806  *
 807  * Given the a list of gpu->ncuda_dev_use GPU device IDs stored in
 808  * gpu_opt->cuda_dev_use check the existence and compatibility
 809  * of the respective GPUs. Also provide the caller with an array containing
 810  * the result of checks in \checkres.
 811  *
 812  * \param[out]  checkres    check result for each ID passed in \requested_devs
 813  * \param[in]   gpu_info    pointer to structure holding GPU information
 814  * \param[out]  gpu_opt     pointer to structure holding GPU options
 815  * \returns                 TRUE if every the requested GPUs are compatible
 816  */
 817 gmx_bool check_selected_cuda_gpus(int                  *checkres,
 818                                   const gmx_gpu_info_t *gpu_info,
 819                                   gmx_gpu_opt_t        *gpu_opt)
 820 {
 821     int  i, id;
 822     bool bAllOk;
 823
 824     assert(checkres);
 825     assert(gpu_info);
 826     assert(gpu_opt->ncuda_dev_use >= 0);
 827
 828     if (gpu_opt->ncuda_dev_use == 0)
 829     {
 830         return TRUE;
 831     }
 832
 833     assert(gpu_opt->cuda_dev_use);
 834
 835     /* we will assume that all GPUs requested are valid IDs,
 836        otherwise we'll bail anyways */
 837
 838     bAllOk = true;
 839     for (i = 0; i < gpu_opt->ncuda_dev_use; i++)
 840     {
 841         id = gpu_opt->cuda_dev_use[i];
 842
 843         /* devices are stored in increasing order of IDs in cuda_dev */
 844         gpu_opt->cuda_dev_use[i] = id;
 845
 846         checkres[i] = (id >= gpu_info->ncuda_dev) ?
 847             egpuNonexistent : gpu_info->cuda_dev[id].stat;
 848
 849         bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
 850     }
 851
 852     return bAllOk;
 853 }
 854
 855 /*! \brief Frees the cuda_dev and cuda_dev_use array fields of \gpu_info.
 856  *
 857  * \param[in]    gpu_info    pointer to structure holding GPU information
 858  */
 859 void free_gpu_info(const gmx_gpu_info_t *gpu_info)
 860 {
 861     if (gpu_info == NULL)
 862     {
 863         return;
 864     }
 865
 866     sfree(gpu_info->cuda_dev);
 867 }
 868
 869 /*! \brief Formats and returns a device information string for a given GPU.
 870  *
 871  * Given an index *directly* into the array of available GPUs (cuda_dev)
 872  * returns a formatted info string for the respective GPU which includes
 873  * ID, name, compute capability, and detection status.
 874  *
 875  * \param[out]  s           pointer to output string (has to be allocated externally)
 876  * \param[in]   gpu_info    pointer to structure holding GPU information
 877  * \param[in]   index       an index *directly* into the array of available GPUs
 878  */
 879 void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index)
 880 {
 881     assert(s);
 882     assert(gpu_info);
 883
 884     if (index < 0 && index >= gpu_info->ncuda_dev)
 885     {
 886         return;
 887     }
 888
 889     cuda_dev_info_t *dinfo = &gpu_info->cuda_dev[index];
 890
 891     bool             bGpuExists =
 892         dinfo->stat == egpuCompatible ||
 893         dinfo->stat == egpuIncompatible;
 894
 895     if (!bGpuExists)
 896     {
 897         sprintf(s, "#%d: %s, stat: %s",
 898                 dinfo->id, "N/A",
 899                 gpu_detect_res_str[dinfo->stat]);
 900     }
 901     else
 902     {
 903         sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
 904                 dinfo->id, dinfo->prop.name,
 905                 dinfo->prop.major, dinfo->prop.minor,
 906                 dinfo->prop.ECCEnabled ? "yes" : " no",
 907                 gpu_detect_res_str[dinfo->stat]);
 908     }
 909 }
 910
 911 /*! \brief Returns the device ID of the GPU with a given index into the array of used GPUs.
 912  *
 913  * Getter function which, given an index into the array of GPUs in use
 914  * (cuda_dev_use) -- typically a tMPI/MPI rank --, returns the device ID of the
 915  * respective CUDA GPU.
 916  *
 917  * \param[in]    gpu_info   pointer to structure holding GPU information
 918  * \param[in]    gpu_opt    pointer to structure holding GPU options
 919  * \param[in]    idx        index into the array of used GPUs
 920  * \returns                 device ID of the requested GPU
 921  */
 922 int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
 923                       const gmx_gpu_opt_t  *gpu_opt,
 924                       int                   idx)
 925 {
 926     assert(gpu_info);
 927     assert(gpu_opt);
 928     assert(idx >= 0 && idx < gpu_opt->ncuda_dev_use);
 929
 930     return gpu_info->cuda_dev[gpu_opt->cuda_dev_use[idx]].id;
 931 }
 932
 933 /*! \brief Returns the device ID of the GPU currently in use.
 934  *
 935  * The GPU used is the one that is active at the time of the call in the active context.
 936  *
 937  * \param[in]    gpu_info   pointer to structure holding GPU information
 938  * \returns                 device ID of the GPU in use at the time of the call
 939  */
 940 int get_current_gpu_device_id(void)
 941 {
 942     int gpuid;
 943     CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 944
 945     return gpuid;
 946 }
 947
 948 /*! \brief Returns the size of the cuda_dev_info struct.
 949  *
 950  * The size of cuda_dev_info can be used for allocation and communication.
 951  *
 952  * \returns                 size in bytes of cuda_dev_info
 953  */
 954 size_t sizeof_cuda_dev_info(void)
 955 {
 956     return sizeof(cuda_dev_info);
 957 }