src/gromacs/gmxlib/gpu_utils/gpu_utils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2010,2011,2012,2013,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include <stdio.h>
  37 #include <stdlib.h>
  38 #include <assert.h>
  39
  40 #include "gromacs/utility/smalloc.h"
  41 #include "types/hw_info.h"
  42
  43 #include "gpu_utils.h"
  44 #include "../cuda_tools/cudautils.cuh"
  45 #include "memtestG80_core.h"
  46
  47 /** Amount of memory to be used in quick memtest. */
  48 #define QUICK_MEM       250
  49 /** Bit flag with type of tests to run in quick memtest. */
  50 #define QUICK_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS
  51 /** Number of iterations in quick memtest. */
  52 #define QUICK_ITER      3
  53
  54 /** Bitflag with all test set on for full memetest. */
  55 #define FULL_TESTS      0x3FFF
  56 /** Number of iterations in full memtest. */
  57 #define FULL_ITER       25
  58
  59 /** Bit flag with type of tests to run in time constrained memtest. */
  60 #define TIMED_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS
  61
  62 /*! \brief
  63  * Max number of devices supported by CUDA (for consistency checking).
  64  *
  65  * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
  66  */
  67 static int cuda_max_device_count = 32;
  68
  69 /** Dummy kernel used for sanity checking. */
  70 __global__ void k_dummy_test()
  71 {
  72 }
  73
  74
  75 /** Bit-flags which refer to memtestG80 test types and are used in do_memtest
  76  * to specify which tests to run. */
  77 enum memtest_G80_test_types {
  78     MOVING_INVERSIONS_10   = 0x1,
  79     MOVING_INVERSIONS_RAND = 0x2,
  80     WALKING_8BIT_M86       = 0x4,
  81     WALKING_0_8BIT         = 0x8,
  82     WALKING_1_8BIT         = 0x10,
  83     WALKING_0_32BIT        = 0x20,
  84     WALKING_1_32BIT        = 0x40,
  85     RANDOM_BLOCKS          = 0x80,
  86     MOD_20_32BIT           = 0x100,
  87     LOGIC_1_ITER           = 0x200,
  88     LOGIC_4_ITER           = 0x400,
  89     LOGIC_1_ITER_SHMEM     = 0x800,
  90     LOGIC_4_ITER_SHMEM     = 0x1000
  91 };
  92
  93
  94 /*!
  95  * \brief Runs GPU sanity checks.
  96  *
  97  * Runs a series of checks to determine that the given GPU and underlying CUDA
  98  * driver/runtime functions properly.
  99  * Returns properties of a device with given ID or the one that has
 100  * already been initialized earlier in the case if of \dev_id == -1.
 101  *
 102  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
 103  * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
 104  * \returns                0 if the device looks OK
 105  *
 106  * TODO: introduce errors codes and handle errors more smoothly.
 107  */
 108 static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
 109 {
 110     cudaError_t cu_err;
 111     int         dev_count, id;
 112
 113     cu_err = cudaGetDeviceCount(&dev_count);
 114     if (cu_err != cudaSuccess)
 115     {
 116         fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
 117                 cudaGetErrorString(cu_err));
 118         return -1;
 119     }
 120
 121     /* no CUDA compatible device at all */
 122     if (dev_count == 0)
 123     {
 124         return -1;
 125     }
 126
 127     /* things might go horribly wrong if cudart is not compatible with the driver */
 128     if (dev_count < 0 || dev_count > cuda_max_device_count)
 129     {
 130         return -1;
 131     }
 132
 133     if (dev_id == -1) /* device already selected let's not destroy the context */
 134     {
 135         cu_err = cudaGetDevice(&id);
 136         if (cu_err != cudaSuccess)
 137         {
 138             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
 139                     cudaGetErrorString(cu_err));
 140             return -1;
 141         }
 142     }
 143     else
 144     {
 145         id = dev_id;
 146         if (id > dev_count - 1) /* pfff there's no such device */
 147         {
 148             fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
 149                     dev_id, dev_count);
 150             return -1;
 151         }
 152     }
 153
 154     memset(dev_prop, 0, sizeof(cudaDeviceProp));
 155     cu_err = cudaGetDeviceProperties(dev_prop, id);
 156     if (cu_err != cudaSuccess)
 157     {
 158         fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
 159                 cudaGetErrorString(cu_err));
 160         return -1;
 161     }
 162
 163     /* both major & minor is 9999 if no CUDA capable devices are present */
 164     if (dev_prop->major == 9999 && dev_prop->minor == 9999)
 165     {
 166         return -1;
 167     }
 168     /* we don't care about emulation mode */
 169     if (dev_prop->major == 0)
 170     {
 171         return -1;
 172     }
 173
 174     if (id != -1)
 175     {
 176         cu_err = cudaSetDevice(id);
 177         if (cu_err != cudaSuccess)
 178         {
 179             fprintf(stderr, "Error %d while switching to device #%d: %s\n",
 180                     cu_err, id, cudaGetErrorString(cu_err));
 181             return -1;
 182         }
 183     }
 184
 185     /* try to execute a dummy kernel */
 186     k_dummy_test<<< 1, 512>>> ();
 187     if (cudaThreadSynchronize() != cudaSuccess)
 188     {
 189         return -1;
 190     }
 191
 192     /* destroy context if we created one */
 193     if (id != -1)
 194     {
 195 #if CUDA_VERSION < 4000
 196         cu_err = cudaThreadExit();
 197         CU_RET_ERR(cu_err, "cudaThreadExit failed");
 198 #else
 199         cu_err = cudaDeviceReset();
 200         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 201 #endif
 202     }
 203
 204     return 0;
 205 }
 206
 207
 208 /*!
 209  * \brief Runs a set of memory tests specified by the given bit-flags.
 210  * Tries to allocate and do the test on \p megs Mb memory or
 211  * the greatest amount that can be allocated (>10Mb).
 212  * In case if an error is detected it stops without finishing the remaining
 213  * steps/iterations and returns greater then zero value.
 214  * In case of other errors (e.g. kernel launch errors, device querying errors)
 215  * -1 is returned.
 216  *
 217  * \param[in] which_tests   variable with bit-flags of the requested tests
 218  * \param[in] megs          amount of memory that will be tested in MB
 219  * \param[in] iter          number of iterations
 220  * \returns                 0 if no error was detected, otherwise >0
 221  */
 222 static int do_memtest(unsigned int which_tests, int megs, int iter)
 223 {
 224     memtestState    tester;
 225     int             i;
 226     uint            err_count; //, err_iter;
 227
 228     // no parameter check as this fn won't be called externally
 229
 230     // let's try to allocate the mem
 231     while (!tester.allocate(megs) && (megs - 10 > 0))
 232     {
 233         megs -= 10; tester.deallocate();
 234     }
 235
 236     if (megs <= 10)
 237     {
 238         fprintf(stderr, "Unable to allocate GPU memory!\n");
 239         return -1;
 240     }
 241
 242     // clear the first 18 bits
 243     which_tests &= 0x3FFF;
 244     for (i = 0; i < iter; i++)
 245     {
 246         // Moving Inversions (ones and zeros)
 247         if ((MOVING_INVERSIONS_10 & which_tests) == MOVING_INVERSIONS_10)
 248         {
 249             tester.gpuMovingInversionsOnesZeros(err_count);
 250             if (err_count > 0)
 251             {
 252                 return MOVING_INVERSIONS_10;
 253             }
 254         }
 255         // Moving Inversions (random)
 256         if ((MOVING_INVERSIONS_RAND & which_tests) == MOVING_INVERSIONS_RAND)
 257         {
 258             tester.gpuMovingInversionsRandom(err_count);
 259             if (err_count > 0)
 260             {
 261                 return MOVING_INVERSIONS_RAND;
 262             }
 263         }
 264         // Memtest86 Walking 8-bit
 265         if ((WALKING_8BIT_M86 & which_tests) == WALKING_8BIT_M86)
 266         {
 267             for (uint shift = 0; shift < 8; shift++)
 268             {
 269                 tester.gpuWalking8BitM86(err_count, shift);
 270                 if (err_count > 0)
 271                 {
 272                     return WALKING_8BIT_M86;
 273                 }
 274             }
 275         }
 276         // True Walking zeros (8-bit)
 277         if ((WALKING_0_8BIT & which_tests) == WALKING_0_8BIT)
 278         {
 279             for (uint shift = 0; shift < 8; shift++)
 280             {
 281                 tester.gpuWalking8Bit(err_count, false, shift);
 282                 if (err_count > 0)
 283                 {
 284                     return WALKING_0_8BIT;
 285                 }
 286             }
 287         }
 288         // True Walking ones (8-bit)
 289         if ((WALKING_1_8BIT & which_tests) == WALKING_1_8BIT)
 290         {
 291             for (uint shift = 0; shift < 8; shift++)
 292             {
 293                 tester.gpuWalking8Bit(err_count, true, shift);
 294                 if (err_count > 0)
 295                 {
 296                     return WALKING_1_8BIT;
 297                 }
 298             }
 299         }
 300         // Memtest86 Walking zeros (32-bit)
 301         if ((WALKING_0_32BIT & which_tests) == WALKING_0_32BIT)
 302         {
 303             for (uint shift = 0; shift < 32; shift++)
 304             {
 305                 tester.gpuWalking32Bit(err_count, false, shift);
 306                 if (err_count > 0)
 307                 {
 308                     return WALKING_0_32BIT;
 309                 }
 310             }
 311         }
 312         // Memtest86 Walking ones (32-bit)
 313         if ((WALKING_1_32BIT & which_tests) == WALKING_1_32BIT)
 314         {
 315             for (uint shift = 0; shift < 32; shift++)
 316             {
 317                 tester.gpuWalking32Bit(err_count, true, shift);
 318                 if (err_count > 0)
 319                 {
 320                     return WALKING_1_32BIT;
 321                 }
 322             }
 323         }
 324         // Random blocks
 325         if ((RANDOM_BLOCKS & which_tests) == RANDOM_BLOCKS)
 326         {
 327             tester.gpuRandomBlocks(err_count, rand());
 328             if (err_count > 0)
 329             {
 330                 return RANDOM_BLOCKS;
 331             }
 332
 333         }
 334
 335         // Memtest86 Modulo-20
 336         if ((MOD_20_32BIT & which_tests) == MOD_20_32BIT)
 337         {
 338             for (uint shift = 0; shift < 20; shift++)
 339             {
 340                 tester.gpuModuloX(err_count, shift, rand(), 20, 2);
 341                 if (err_count > 0)
 342                 {
 343                     return MOD_20_32BIT;
 344                 }
 345             }
 346         }
 347         // Logic (one iteration)
 348         if ((LOGIC_1_ITER & which_tests) == LOGIC_1_ITER)
 349         {
 350             tester.gpuShortLCG0(err_count, 1);
 351             if (err_count > 0)
 352             {
 353                 return LOGIC_1_ITER;
 354             }
 355         }
 356         // Logic (4 iterations)
 357         if ((LOGIC_4_ITER & which_tests) == LOGIC_4_ITER)
 358         {
 359             tester.gpuShortLCG0(err_count, 4);
 360             if (err_count > 0)
 361             {
 362                 return LOGIC_4_ITER;
 363             }
 364
 365         }
 366         // Logic (shared memory, one iteration)
 367         if ((LOGIC_1_ITER_SHMEM & which_tests) == LOGIC_1_ITER_SHMEM)
 368         {
 369             tester.gpuShortLCG0Shmem(err_count, 1);
 370             if (err_count > 0)
 371             {
 372                 return LOGIC_1_ITER_SHMEM;
 373             }
 374         }
 375         // Logic (shared-memory, 4 iterations)
 376         if ((LOGIC_4_ITER_SHMEM & which_tests) == LOGIC_4_ITER_SHMEM)
 377         {
 378             tester.gpuShortLCG0Shmem(err_count, 4);
 379             if (err_count > 0)
 380             {
 381                 return LOGIC_4_ITER_SHMEM;
 382             }
 383         }
 384     }
 385
 386     tester.deallocate();
 387     return err_count;
 388 }
 389
 390 /*! \brief Runs a quick memory test and returns 0 in case if no error is detected.
 391  * If an error is detected it stops before completing the test and returns a
 392  * value greater then 0. In case of other errors (e.g. kernel launch errors,
 393  * device querying errors) -1 is returned.
 394  *
 395  * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
 396  * \returns             0 if no error was detected, otherwise >0
 397  */
 398 int do_quick_memtest(int dev_id)
 399 {
 400     cudaDeviceProp  dev_prop;
 401     int             devmem, res, time = 0;
 402
 403     if (debug)
 404     {
 405         time = getTimeMilliseconds();
 406     }
 407
 408     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 409     {
 410         // something went wrong
 411         return -1;
 412     }
 413
 414     if (debug)
 415     {
 416         devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 417         fprintf(debug, ">> Running QUICK memtests on %d MiB (out of total %d MiB), %d iterations\n",
 418                 QUICK_MEM, devmem, QUICK_ITER);
 419     }
 420
 421     res = do_memtest(QUICK_TESTS, QUICK_MEM, QUICK_ITER);
 422
 423     if (debug)
 424     {
 425         fprintf(debug, "Q-RES = %d\n", res);
 426         fprintf(debug, "Q-runtime: %d ms\n", getTimeMilliseconds() - time);
 427     }
 428
 429     /* destroy context only if we created it */
 430     if (dev_id != -1)
 431     {
 432         cudaThreadExit();
 433     }
 434     return res;
 435 }
 436
 437 /*! \brief Runs a full memory test and returns 0 in case if no error is detected.
 438  * If an error is detected  it stops before completing the test and returns a
 439  * value greater then 0. In case of other errors (e.g. kernel launch errors,
 440  * device querying errors) -1 is returned.
 441  *
 442  * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
 443  * \returns             0 if no error was detected, otherwise >0
 444  */
 445
 446 int do_full_memtest(int dev_id)
 447 {
 448     cudaDeviceProp  dev_prop;
 449     int             devmem, res, time = 0;
 450
 451     if (debug)
 452     {
 453         time = getTimeMilliseconds();
 454     }
 455
 456     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 457     {
 458         // something went wrong
 459         return -1;
 460     }
 461
 462     devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 463
 464     if (debug)
 465     {
 466         fprintf(debug, ">> Running FULL memtests on %d MiB (out of total %d MiB), %d iterations\n",
 467                 devmem, devmem, FULL_ITER);
 468     }
 469
 470     /* do all test on the entire memory */
 471     res = do_memtest(FULL_TESTS, devmem, FULL_ITER);
 472
 473     if (debug)
 474     {
 475         fprintf(debug, "F-RES = %d\n", res);
 476         fprintf(debug, "F-runtime: %d ms\n", getTimeMilliseconds() - time);
 477     }
 478
 479     /* destroy context only if we created it */
 480     if (dev_id != -1)
 481     {
 482         cudaThreadExit();
 483     }
 484     return res;
 485 }
 486
 487 /*! \brief Runs a time constrained memory test and returns 0 in case if no error is detected.
 488  * If an error is detected it stops before completing the test and returns a value greater
 489  * than zero. In case of other errors (e.g. kernel launch errors, device querying errors) -1
 490  * is returned. Note, that test iterations are not interrupted therefor the total runtime of
 491  * the test will always be multipple of one iteration's runtime.
 492  *
 493  * \param[in] dev_id        the device id of the GPU or -1 if the device has laredy been selected
 494  * \param[in] time_constr   the time limit of the testing
 495  * \returns                 0 if no error was detected, otherwise >0
 496  */
 497 int do_timed_memtest(int dev_id, int time_constr)
 498 {
 499     cudaDeviceProp  dev_prop;
 500     int             devmem, res = 0, time = 0, startt;
 501
 502     if (debug)
 503     {
 504         time = getTimeMilliseconds();
 505     }
 506
 507     time_constr *= 1000;  /* convert to ms for convenience */
 508     startt       = getTimeMilliseconds();
 509
 510     if (do_sanity_checks(dev_id, &dev_prop) != 0)
 511     {
 512         // something went wrong
 513         return -1;
 514     }
 515
 516     devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 517
 518     if (debug)
 519     {
 520         fprintf(debug, ">> Running time constrained memtests on %d MiB (out of total %d MiB), time limit of %d s \n",
 521                 devmem, devmem, time_constr);
 522     }
 523
 524     /* do the TIMED_TESTS set, one step at a time on the entire memory
 525        that can be allocated, and stop when the given time is exceeded */
 526     while ( ((int)getTimeMilliseconds() - startt) < time_constr)
 527     {
 528         res = do_memtest(TIMED_TESTS, devmem, 1);
 529         if (res != 0)
 530         {
 531             break;
 532         }
 533     }
 534
 535     if (debug)
 536     {
 537         fprintf(debug, "T-RES = %d\n", res);
 538         fprintf(debug, "T-runtime: %d ms\n", getTimeMilliseconds() - time);
 539     }
 540
 541     /* destroy context only if we created it */
 542     if (dev_id != -1)
 543     {
 544         cudaThreadExit();
 545     }
 546     return res;
 547 }
 548
 549 /*! \brief Initializes the GPU with the given index.
 550  *
 551  * The varible \mygpu is the index of the GPU to initialize in the
 552  * gpu_info.cuda_dev array.
 553  *
 554  * \param[in]  mygpu        index of the GPU to initialize
 555  * \param[out] result_str   the message related to the error that occurred
 556  *                          during the initialization (if there was any).
 557  * \param[in] gpu_info      GPU info of all detected devices in the system.
 558  * \param[in] gpu_opt       options for using the GPUs in gpu_info
 559  * \returns                 true if no error occurs during initialization.
 560  */
 561 gmx_bool init_gpu(int mygpu, char *result_str,
 562                   const gmx_gpu_info_t *gpu_info,
 563                   const gmx_gpu_opt_t *gpu_opt)
 564 {
 565     cudaError_t stat;
 566     char        sbuf[STRLEN];
 567     int         gpuid;
 568
 569     assert(gpu_info);
 570     assert(result_str);
 571
 572     if (mygpu < 0 || mygpu >= gpu_opt->ncuda_dev_use)
 573     {
 574         sprintf(sbuf, "Trying to initialize an inexistent GPU: "
 575                 "there are %d %s-selected GPU(s), but #%d was requested.",
 576                 gpu_opt->ncuda_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
 577         gmx_incons(sbuf);
 578     }
 579
 580     gpuid = gpu_info->cuda_dev[gpu_opt->cuda_dev_use[mygpu]].id;
 581
 582     stat = cudaSetDevice(gpuid);
 583     strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 584
 585     if (debug)
 586     {
 587         fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->cuda_dev[gpuid].prop.name);
 588     }
 589
 590     return (stat == cudaSuccess);
 591 }
 592
 593 /*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
 594  *
 595  * The context is explicitly destroyed and therefore all data uploaded to the GPU
 596  * is lost. This should only be called when none of this data is required anymore.
 597  *
 598  * \param[out] result_str   the message related to the error that occurred
 599  *                          during the initialization (if there was any).
 600  * \returns                 true if no error occurs during the freeing.
 601  */
 602 gmx_bool free_gpu(char *result_str)
 603 {
 604     cudaError_t stat;
 605
 606     assert(result_str);
 607
 608     if (debug)
 609     {
 610         int gpuid;
 611         stat = cudaGetDevice(&gpuid);
 612         CU_RET_ERR(stat, "cudaGetDevice failed");
 613         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 614     }
 615
 616 #if CUDA_VERSION < 4000
 617     stat = cudaThreadExit();
 618 #else
 619     stat = cudaDeviceReset();
 620 #endif
 621     strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 622
 623     return (stat == cudaSuccess);
 624 }
 625
 626 /*! \brief Returns true if the gpu characterized by the device properties is
 627  *  supported by the native gpu acceleration.
 628  *
 629  * \param[in] dev_prop  the CUDA device properties of the gpus to test.
 630  * \returns             true if the GPU properties passed indicate a compatible
 631  *                      GPU, otherwise false.
 632  */
 633 static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
 634 {
 635     return (dev_prop->major >= 2);
 636 }
 637
 638 /*! \brief Helper function that checks whether a given GPU status indicates compatible GPU.
 639  *
 640  * \param[in] stat  GPU status.
 641  * \returns         true if the provided status is egpuCompatible, otherwise false.
 642  */
 643 static bool is_compatible_gpu(int stat)
 644 {
 645     return (stat == egpuCompatible);
 646 }
 647
 648 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 649  *
 650  *  Returns a status value which indicates compatibility or one of the following
 651  *  errors: incompatibility, insistence, or insanity (=unexpected behavior).
 652  *  It also returns the respective device's properties in \dev_prop (if applicable).
 653  *
 654  *  \param[in]  dev_id   the ID of the GPU to check.
 655  *  \param[out] dev_prop the CUDA device properties of the device checked.
 656  *  \returns             the status of the requested device
 657  */
 658 static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
 659 {
 660     cudaError_t stat;
 661     int         ndev;
 662
 663     stat = cudaGetDeviceCount(&ndev);
 664     if (stat != cudaSuccess)
 665     {
 666         return egpuInsane;
 667     }
 668
 669     if (dev_id > ndev - 1)
 670     {
 671         return egpuNonexistent;
 672     }
 673
 674     /* TODO: currently we do not make a distinction between the type of errors
 675      * that can appear during sanity checks. This needs to be improved, e.g if
 676      * the dummy test kernel fails to execute with a "device busy message" we
 677      * should appropriately report that the device is busy instead of insane.
 678      */
 679     if (do_sanity_checks(dev_id, dev_prop) == 0)
 680     {
 681         if (is_gmx_supported_gpu(dev_prop))
 682         {
 683             return egpuCompatible;
 684         }
 685         else
 686         {
 687             return egpuIncompatible;
 688         }
 689     }
 690     else
 691     {
 692         return egpuInsane;
 693     }
 694 }
 695
 696
 697 /*! \brief Detect all NVIDIA GPUs in the system.
 698  *
 699  *  Will detect every NVIDIA GPU supported by the device driver in use. Also
 700  *  check for the compatibility of each and fill the gpu_info->cuda_dev array
 701  *  with the required information on each the device: ID, device properties,
 702  *  status.
 703  *
 704  *  \param[in] gpu_info    pointer to structure holding GPU information.
 705  *  \param[out] err_str    The error message of any CUDA API error that caused
 706  *                         the detection to fail (if there was any). The memory
 707  *                         the pointer points to should be managed externally.
 708  *  \returns               non-zero if the detection encountered a failure, zero otherwise.
 709  */
 710 int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
 711 {
 712     int              i, ndev, checkres, retval;
 713     cudaError_t      stat;
 714     cudaDeviceProp   prop;
 715     cuda_dev_info_t *devs;
 716
 717     assert(gpu_info);
 718     assert(err_str);
 719
 720     gpu_info->ncuda_dev_compatible = 0;
 721
 722     ndev    = 0;
 723     devs    = NULL;
 724
 725     stat = cudaGetDeviceCount(&ndev);
 726     if (stat != cudaSuccess)
 727     {
 728         const char *s;
 729
 730         /* cudaGetDeviceCount failed which means that there is something
 731          * wrong with the machine: driver-runtime mismatch, all GPUs being
 732          * busy in exclusive mode, or some other condition which should
 733          * result in us issuing a warning a falling back to CPUs. */
 734         retval = -1;
 735         s      = cudaGetErrorString(stat);
 736         strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
 737     }
 738     else
 739     {
 740         snew(devs, ndev);
 741         for (i = 0; i < ndev; i++)
 742         {
 743             checkres = is_gmx_supported_gpu_id(i, &prop);
 744
 745             devs[i].id   = i;
 746             devs[i].prop = prop;
 747             devs[i].stat = checkres;
 748
 749             if (checkres == egpuCompatible)
 750             {
 751                 gpu_info->ncuda_dev_compatible++;
 752             }
 753         }
 754         retval = 0;
 755     }
 756
 757     gpu_info->ncuda_dev = ndev;
 758     gpu_info->cuda_dev  = devs;
 759
 760     return retval;
 761 }
 762
 763 /*! \brief Select the GPUs compatible with the native GROMACS acceleration.
 764  *
 765  * This function selects the compatible gpus and initializes
 766  * gpu_info->cuda_dev_use and gpu_info->ncuda_dev_use.
 767  *
 768  * Given the list of GPUs available in the system check each device in
 769  * gpu_info->cuda_dev and place the indices of the compatible GPUs into
 770  * cuda_dev_use with this marking the respective GPUs as "available for use."
 771  * Note that \detect_cuda_gpus must have been called before.
 772  *
 773  * \param[in]     gpu_info    pointer to structure holding GPU information
 774  * \param[in,out] gpu_opt     pointer to structure holding GPU options
 775  */
 776 void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info,
 777                           gmx_gpu_opt_t        *gpu_opt)
 778 {
 779     int  i, ncompat;
 780     int *compat;
 781
 782     assert(gpu_info);
 783     /* cuda_dev/ncuda_dev have to be either NULL/0 or not (NULL/0) */
 784     assert((gpu_info->ncuda_dev != 0 ? 0 : 1) ^ (gpu_info->cuda_dev == NULL ? 0 : 1));
 785
 786     snew(compat, gpu_info->ncuda_dev);
 787     ncompat = 0;
 788     for (i = 0; i < gpu_info->ncuda_dev; i++)
 789     {
 790         if (is_compatible_gpu(gpu_info->cuda_dev[i].stat))
 791         {
 792             ncompat++;
 793             compat[ncompat - 1] = i;
 794         }
 795     }
 796
 797     gpu_opt->ncuda_dev_use = ncompat;
 798     snew(gpu_opt->cuda_dev_use, ncompat);
 799     memcpy(gpu_opt->cuda_dev_use, compat, ncompat*sizeof(*compat));
 800     sfree(compat);
 801 }
 802
 803 /*! \brief Check the existence/compatibility of a set of GPUs specified by their device IDs.
 804  *
 805  * Given the a list of gpu->ncuda_dev_use GPU device IDs stored in
 806  * gpu_opt->cuda_dev_use check the existence and compatibility
 807  * of the respective GPUs. Also provide the caller with an array containing
 808  * the result of checks in \checkres.
 809  *
 810  * \param[out]  checkres    check result for each ID passed in \requested_devs
 811  * \param[in]   gpu_info    pointer to structure holding GPU information
 812  * \param[out]  gpu_opt     pointer to structure holding GPU options
 813  * \returns                 TRUE if every the requested GPUs are compatible
 814  */
 815 gmx_bool check_selected_cuda_gpus(int                  *checkres,
 816                                   const gmx_gpu_info_t *gpu_info,
 817                                   gmx_gpu_opt_t        *gpu_opt)
 818 {
 819     int  i, id;
 820     bool bAllOk;
 821
 822     assert(checkres);
 823     assert(gpu_info);
 824     assert(gpu_opt->ncuda_dev_use >= 0);
 825
 826     if (gpu_opt->ncuda_dev_use == 0)
 827     {
 828         return TRUE;
 829     }
 830
 831     assert(gpu_opt->cuda_dev_use);
 832
 833     /* we will assume that all GPUs requested are valid IDs,
 834        otherwise we'll bail anyways */
 835
 836     bAllOk = true;
 837     for (i = 0; i < gpu_opt->ncuda_dev_use; i++)
 838     {
 839         id = gpu_opt->cuda_dev_use[i];
 840
 841         /* devices are stored in increasing order of IDs in cuda_dev */
 842         gpu_opt->cuda_dev_use[i] = id;
 843
 844         checkres[i] = (id >= gpu_info->ncuda_dev) ?
 845             egpuNonexistent : gpu_info->cuda_dev[id].stat;
 846
 847         bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
 848     }
 849
 850     return bAllOk;
 851 }
 852
 853 /*! \brief Frees the cuda_dev and cuda_dev_use array fields of \gpu_info.
 854  *
 855  * \param[in]    gpu_info    pointer to structure holding GPU information
 856  */
 857 void free_gpu_info(const gmx_gpu_info_t *gpu_info)
 858 {
 859     if (gpu_info == NULL)
 860     {
 861         return;
 862     }
 863
 864     sfree(gpu_info->cuda_dev);
 865 }
 866
 867 /*! \brief Formats and returns a device information string for a given GPU.
 868  *
 869  * Given an index *directly* into the array of available GPUs (cuda_dev)
 870  * returns a formatted info string for the respective GPU which includes
 871  * ID, name, compute capability, and detection status.
 872  *
 873  * \param[out]  s           pointer to output string (has to be allocated externally)
 874  * \param[in]   gpu_info    pointer to structure holding GPU information
 875  * \param[in]   index       an index *directly* into the array of available GPUs
 876  */
 877 void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index)
 878 {
 879     assert(s);
 880     assert(gpu_info);
 881
 882     if (index < 0 && index >= gpu_info->ncuda_dev)
 883     {
 884         return;
 885     }
 886
 887     cuda_dev_info_t *dinfo = &gpu_info->cuda_dev[index];
 888
 889     bool             bGpuExists =
 890         dinfo->stat == egpuCompatible ||
 891         dinfo->stat == egpuIncompatible;
 892
 893     if (!bGpuExists)
 894     {
 895         sprintf(s, "#%d: %s, stat: %s",
 896                 dinfo->id, "N/A",
 897                 gpu_detect_res_str[dinfo->stat]);
 898     }
 899     else
 900     {
 901         sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
 902                 dinfo->id, dinfo->prop.name,
 903                 dinfo->prop.major, dinfo->prop.minor,
 904                 dinfo->prop.ECCEnabled ? "yes" : " no",
 905                 gpu_detect_res_str[dinfo->stat]);
 906     }
 907 }
 908
 909 /*! \brief Returns the device ID of the GPU with a given index into the array of used GPUs.
 910  *
 911  * Getter function which, given an index into the array of GPUs in use
 912  * (cuda_dev_use) -- typically a tMPI/MPI rank --, returns the device ID of the
 913  * respective CUDA GPU.
 914  *
 915  * \param[in]    gpu_info   pointer to structure holding GPU information
 916  * \param[in]    gpu_opt    pointer to structure holding GPU options
 917  * \param[in]    idx        index into the array of used GPUs
 918  * \returns                 device ID of the requested GPU
 919  */
 920 int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
 921                       const gmx_gpu_opt_t  *gpu_opt,
 922                       int                   idx)
 923 {
 924     assert(gpu_info);
 925     assert(gpu_opt);
 926     assert(idx >= 0 && idx < gpu_opt->ncuda_dev_use);
 927
 928     return gpu_info->cuda_dev[gpu_opt->cuda_dev_use[idx]].id;
 929 }
 930
 931 /*! \brief Returns the device ID of the GPU currently in use.
 932  *
 933  * The GPU used is the one that is active at the time of the call in the active context.
 934  *
 935  * \param[in]    gpu_info   pointer to structure holding GPU information
 936  * \returns                 device ID of the GPU in use at the time of the call
 937  */
 938 int get_current_gpu_device_id(void)
 939 {
 940     int gpuid;
 941     CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 942
 943     return gpuid;
 944 }
 945
 946 /*! \brief Returns the size of the cuda_dev_info struct.
 947  *
 948  * The size of cuda_dev_info can be used for allocation and communication.
 949  *
 950  * \returns                 size in bytes of cuda_dev_info
 951  */
 952 size_t sizeof_cuda_dev_info(void)
 953 {
 954     return sizeof(cuda_dev_info);
 955 }