src/gromacs/gpu_utils/gpu_utils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2010,2011,2012,2013,2014,2015,2016, The GROMACS development team.
   5  * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36 /*! \file
  37  *  \brief Define functions for detection and initialization for CUDA devices.
  38  *
  39  *  \author Szilard Pall <pall.szilard@gmail.com>
  40  */
  41
  42 #include "gmxpre.h"
  43
  44 #include "gpu_utils.h"
  45
  46 #include <assert.h>
  47 #include <stdio.h>
  48 #include <stdlib.h>
  49
  50 #include <cuda_profiler_api.h>
  51
  52 #include "gromacs/gpu_utils/cudautils.cuh"
  53 #include "gromacs/gpu_utils/pmalloc_cuda.h"
  54 #include "gromacs/hardware/gpu_hw_info.h"
  55 #include "gromacs/utility/basedefinitions.h"
  56 #include "gromacs/utility/cstringutil.h"
  57 #include "gromacs/utility/exceptions.h"
  58 #include "gromacs/utility/fatalerror.h"
  59 #include "gromacs/utility/gmxassert.h"
  60 #include "gromacs/utility/logger.h"
  61 #include "gromacs/utility/programcontext.h"
  62 #include "gromacs/utility/smalloc.h"
  63 #include "gromacs/utility/snprintf.h"
  64 #include "gromacs/utility/stringutil.h"
  65
  66 /*! \internal \brief
  67  * Max number of devices supported by CUDA (for consistency checking).
  68  *
  69  * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
  70  */
  71 static int cuda_max_device_count = 32;
  72
  73 static bool cudaProfilerRun = ((getenv("NVPROF_ID") != nullptr));
  74
  75 /** Dummy kernel used for sanity checking. */
  76 static __global__ void k_dummy_test(void) {}
  77
  78 static cudaError_t checkCompiledTargetCompatibility(int deviceId, const cudaDeviceProp& deviceProp)
  79 {
  80     cudaFuncAttributes attributes;
  81     cudaError_t        stat = cudaFuncGetAttributes(&attributes, k_dummy_test);
  82
  83     if (cudaErrorInvalidDeviceFunction == stat)
  84     {
  85         fprintf(stderr,
  86                 "\nWARNING: The %s binary does not include support for the CUDA architecture of "
  87                 "the GPU ID #%d (compute capability %d.%d) detected during detection. "
  88                 "By default, GROMACS supports all architectures of compute "
  89                 "capability >= 3.0, so your GPU "
  90                 "might be rare, or some architectures were disabled in the build. \n"
  91                 "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
  92                 "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. \n",
  93                 gmx::getProgramContext().displayName(), deviceId, deviceProp.major, deviceProp.minor);
  94     }
  95
  96     return stat;
  97 }
  98
  99 bool isHostMemoryPinned(const void* h_ptr)
 100 {
 101     cudaPointerAttributes memoryAttributes;
 102     cudaError_t           stat = cudaPointerGetAttributes(&memoryAttributes, h_ptr);
 103
 104     bool result = false;
 105     switch (stat)
 106     {
 107         case cudaSuccess: result = true; break;
 108
 109         case cudaErrorInvalidValue:
 110             // If the buffer was not pinned, then it will not be recognized by CUDA at all
 111             result = false;
 112             // Reset the last error status
 113             cudaGetLastError();
 114             break;
 115
 116         default: CU_RET_ERR(stat, "Unexpected CUDA error");
 117     }
 118     return result;
 119 }
 120
 121 /*!
 122  * \brief Runs GPU sanity checks.
 123  *
 124  * Runs a series of checks to determine that the given GPU and underlying CUDA
 125  * driver/runtime functions properly.
 126  *
 127  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
 128  * \param[in]  dev_prop    The device properties structure
 129  * \returns                0 if the device looks OK, -1 if it sanity checks failed, and -2 if the device is busy
 130  *
 131  * TODO: introduce errors codes and handle errors more smoothly.
 132  */
 133 static int do_sanity_checks(int dev_id, const cudaDeviceProp& dev_prop)
 134 {
 135     cudaError_t cu_err;
 136     int         dev_count, id;
 137
 138     cu_err = cudaGetDeviceCount(&dev_count);
 139     if (cu_err != cudaSuccess)
 140     {
 141         fprintf(stderr, "Error %d while querying device count: %s\n", cu_err, cudaGetErrorString(cu_err));
 142         return -1;
 143     }
 144
 145     /* no CUDA compatible device at all */
 146     if (dev_count == 0)
 147     {
 148         return -1;
 149     }
 150
 151     /* things might go horribly wrong if cudart is not compatible with the driver */
 152     if (dev_count < 0 || dev_count > cuda_max_device_count)
 153     {
 154         return -1;
 155     }
 156
 157     if (dev_id == -1) /* device already selected let's not destroy the context */
 158     {
 159         cu_err = cudaGetDevice(&id);
 160         if (cu_err != cudaSuccess)
 161         {
 162             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err, cudaGetErrorString(cu_err));
 163             return -1;
 164         }
 165     }
 166     else
 167     {
 168         id = dev_id;
 169         if (id > dev_count - 1) /* pfff there's no such device */
 170         {
 171             fprintf(stderr,
 172                     "The requested device with id %d does not seem to exist (device count=%d)\n",
 173                     dev_id, dev_count);
 174             return -1;
 175         }
 176     }
 177
 178     /* both major & minor is 9999 if no CUDA capable devices are present */
 179     if (dev_prop.major == 9999 && dev_prop.minor == 9999)
 180     {
 181         return -1;
 182     }
 183     /* we don't care about emulation mode */
 184     if (dev_prop.major == 0)
 185     {
 186         return -1;
 187     }
 188
 189     if (id != -1)
 190     {
 191         cu_err = cudaSetDevice(id);
 192         if (cu_err != cudaSuccess)
 193         {
 194             fprintf(stderr, "Error %d while switching to device #%d: %s\n", cu_err, id,
 195                     cudaGetErrorString(cu_err));
 196             return -1;
 197         }
 198     }
 199
 200     cu_err = checkCompiledTargetCompatibility(dev_id, dev_prop);
 201     // Avoid triggering an error if GPU devices are in exclusive or prohibited mode;
 202     // it is enough to check for cudaErrorDevicesUnavailable only here because
 203     // if we encounter it that will happen in cudaFuncGetAttributes in the above function.
 204     if (cu_err == cudaErrorDevicesUnavailable)
 205     {
 206         return -2;
 207     }
 208     else if (cu_err != cudaSuccess)
 209     {
 210         return -1;
 211     }
 212
 213     /* try to execute a dummy kernel */
 214     try
 215     {
 216         KernelLaunchConfig config;
 217         config.blockSize[0]       = 512;
 218         const auto dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
 219         launchGpuKernel(k_dummy_test, config, nullptr, "Dummy kernel", dummyArguments);
 220     }
 221     catch (gmx::GromacsException& ex)
 222     {
 223         // launchGpuKernel error is not fatal and should continue with marking the device bad
 224         fprintf(stderr,
 225                 "Error occurred while running dummy kernel sanity check on device #%d:\n %s\n", id,
 226                 formatExceptionMessageToString(ex).c_str());
 227         return -1;
 228     }
 229
 230     if (cudaDeviceSynchronize() != cudaSuccess)
 231     {
 232         return -1;
 233     }
 234
 235     /* destroy context if we created one */
 236     if (id != -1)
 237     {
 238         cu_err = cudaDeviceReset();
 239         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 240     }
 241
 242     return 0;
 243 }
 244
 245 void init_gpu(const gmx_device_info_t* deviceInfo)
 246 {
 247     cudaError_t stat;
 248
 249     assert(deviceInfo);
 250
 251     stat = cudaSetDevice(deviceInfo->id);
 252     if (stat != cudaSuccess)
 253     {
 254         auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id);
 255         CU_RET_ERR(stat, message.c_str());
 256     }
 257
 258     if (debug)
 259     {
 260         fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceInfo->id, deviceInfo->prop.name);
 261     }
 262 }
 263
 264 void free_gpu(const gmx_device_info_t* deviceInfo)
 265 {
 266     // One should only attempt to clear the device context when
 267     // it has been used, but currently the only way to know that a GPU
 268     // device was used is that deviceInfo will be non-null.
 269     if (deviceInfo == nullptr)
 270     {
 271         return;
 272     }
 273
 274     cudaError_t stat;
 275
 276     if (debug)
 277     {
 278         int gpuid;
 279         stat = cudaGetDevice(&gpuid);
 280         CU_RET_ERR(stat, "cudaGetDevice failed");
 281         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 282     }
 283
 284     stat = cudaDeviceReset();
 285     if (stat != cudaSuccess)
 286     {
 287         gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat));
 288     }
 289 }
 290
 291 gmx_device_info_t* getDeviceInfo(const gmx_gpu_info_t& gpu_info, int deviceId)
 292 {
 293     if (deviceId < 0 || deviceId >= gpu_info.n_dev)
 294     {
 295         gmx_incons("Invalid GPU deviceId requested");
 296     }
 297     return &gpu_info.gpu_dev[deviceId];
 298 }
 299
 300 /*! \brief Returns true if the gpu characterized by the device properties is
 301  *  supported by the native gpu acceleration.
 302  *
 303  * \param[in] dev_prop  the CUDA device properties of the gpus to test.
 304  * \returns             true if the GPU properties passed indicate a compatible
 305  *                      GPU, otherwise false.
 306  */
 307 static bool is_gmx_supported_gpu(const cudaDeviceProp& dev_prop)
 308 {
 309     return (dev_prop.major >= 3);
 310 }
 311
 312 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 313  *
 314  *  Returns a status value which indicates compatibility or one of the following
 315  *  errors: incompatibility or insanity (=unexpected behavior).
 316  *
 317  *  As the error handling only permits returning the state of the GPU, this function
 318  *  does not clear the CUDA runtime API status allowing the caller to inspect the error
 319  *  upon return. Note that this also means it is the caller's responsibility to
 320  *  reset the CUDA runtime state.
 321  *
 322  *  \param[in]  deviceId   the ID of the GPU to check.
 323  *  \param[in]  deviceProp the CUDA device properties of the device checked.
 324  *  \returns               the status of the requested device
 325  */
 326 static int is_gmx_supported_gpu_id(int deviceId, const cudaDeviceProp& deviceProp)
 327 {
 328     if (!is_gmx_supported_gpu(deviceProp))
 329     {
 330         return egpuIncompatible;
 331     }
 332
 333     /* TODO: currently we do not make a distinction between the type of errors
 334      * that can appear during sanity checks. This needs to be improved, e.g if
 335      * the dummy test kernel fails to execute with a "device busy message" we
 336      * should appropriately report that the device is busy instead of insane.
 337      */
 338     const int checkResult = do_sanity_checks(deviceId, deviceProp);
 339     switch (checkResult)
 340     {
 341         case 0: return egpuCompatible;
 342         case -1: return egpuInsane;
 343         case -2: return egpuUnavailable;
 344         default:
 345             GMX_RELEASE_ASSERT(false, "Invalid do_sanity_checks() return value");
 346             return egpuCompatible;
 347     }
 348 }
 349
 350 bool isGpuDetectionFunctional(std::string* errorMessage)
 351 {
 352     cudaError_t stat;
 353     int         driverVersion = -1;
 354     stat                      = cudaDriverGetVersion(&driverVersion);
 355     GMX_ASSERT(stat != cudaErrorInvalidValue,
 356                "An impossible null pointer was passed to cudaDriverGetVersion");
 357     GMX_RELEASE_ASSERT(
 358             stat == cudaSuccess,
 359             gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
 360                               cudaGetErrorName(stat), cudaGetErrorString(stat))
 361                     .c_str());
 362     bool foundDriver = (driverVersion > 0);
 363     if (!foundDriver)
 364     {
 365         // Can't detect GPUs if there is no driver
 366         if (errorMessage != nullptr)
 367         {
 368             errorMessage->assign("No valid CUDA driver found");
 369         }
 370         return false;
 371     }
 372
 373     int numDevices;
 374     stat = cudaGetDeviceCount(&numDevices);
 375     if (stat != cudaSuccess)
 376     {
 377         if (errorMessage != nullptr)
 378         {
 379             /* cudaGetDeviceCount failed which means that there is
 380              * something wrong with the machine: driver-runtime
 381              * mismatch, all GPUs being busy in exclusive mode,
 382              * invalid CUDA_VISIBLE_DEVICES, or some other condition
 383              * which should result in GROMACS issuing at least a
 384              * warning. */
 385             errorMessage->assign(cudaGetErrorString(stat));
 386         }
 387
 388         // Consume the error now that we have prepared to handle
 389         // it. This stops it reappearing next time we check for
 390         // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
 391         // valid devices, then cudaGetLastError returns the
 392         // (undocumented) cudaErrorNoDevice, but this should not be a
 393         // problem as there should be no future CUDA API calls.
 394         // NVIDIA bug report #2038718 has been filed.
 395         cudaGetLastError();
 396         // Can't detect GPUs
 397         return false;
 398     }
 399
 400     // We don't actually use numDevices here, that's not the job of
 401     // this function.
 402     return true;
 403 }
 404
 405 void findGpus(gmx_gpu_info_t* gpu_info)
 406 {
 407     assert(gpu_info);
 408
 409     gpu_info->n_dev_compatible = 0;
 410
 411     int         ndev;
 412     cudaError_t stat = cudaGetDeviceCount(&ndev);
 413     if (stat != cudaSuccess)
 414     {
 415         GMX_THROW(gmx::InternalError(
 416                 "Invalid call of findGpus() when CUDA API returned an error, perhaps "
 417                 "canDetectGpus() was not called appropriately beforehand."));
 418     }
 419
 420     // We expect to start device support/sanity checks with a clean runtime error state
 421     gmx::ensureNoPendingCudaError("");
 422
 423     gmx_device_info_t* devs;
 424     snew(devs, ndev);
 425     for (int i = 0; i < ndev; i++)
 426     {
 427         cudaDeviceProp prop;
 428         memset(&prop, 0, sizeof(cudaDeviceProp));
 429         stat = cudaGetDeviceProperties(&prop, i);
 430         int checkResult;
 431         if (stat != cudaSuccess)
 432         {
 433             // Will handle the error reporting below
 434             checkResult = egpuInsane;
 435         }
 436         else
 437         {
 438             checkResult = is_gmx_supported_gpu_id(i, prop);
 439         }
 440
 441         devs[i].id   = i;
 442         devs[i].prop = prop;
 443         devs[i].stat = checkResult;
 444
 445         if (checkResult == egpuCompatible)
 446         {
 447             gpu_info->n_dev_compatible++;
 448         }
 449         else
 450         {
 451             // TODO:
 452             //  - we inspect the CUDA API state to retrieve and record any
 453             //    errors that occurred during is_gmx_supported_gpu_id() here,
 454             //    but this would be more elegant done within is_gmx_supported_gpu_id()
 455             //    and only return a string with the error if one was encountered.
 456             //  - we'll be reporting without rank information which is not ideal.
 457             //  - we'll end up warning also in cases where users would already
 458             //    get an error before mdrun aborts.
 459             //
 460             // Here we also clear the CUDA API error state so potential
 461             // errors during sanity checks don't propagate.
 462             if ((stat = cudaGetLastError()) != cudaSuccess)
 463             {
 464                 gmx_warning("An error occurred while sanity checking device #%d; %s: %s",
 465                             devs[i].id, cudaGetErrorName(stat), cudaGetErrorString(stat));
 466             }
 467         }
 468     }
 469
 470     stat = cudaPeekAtLastError();
 471     GMX_RELEASE_ASSERT(stat == cudaSuccess,
 472                        gmx::formatString("We promise to return with clean CUDA state, but "
 473                                          "non-success state encountered: %s: %s",
 474                                          cudaGetErrorName(stat), cudaGetErrorString(stat))
 475                                .c_str());
 476
 477     gpu_info->n_dev   = ndev;
 478     gpu_info->gpu_dev = devs;
 479 }
 480
 481 void get_gpu_device_info_string(char* s, const gmx_gpu_info_t& gpu_info, int index)
 482 {
 483     assert(s);
 484
 485     if (index < 0 && index >= gpu_info.n_dev)
 486     {
 487         return;
 488     }
 489
 490     gmx_device_info_t* dinfo = &gpu_info.gpu_dev[index];
 491
 492     bool bGpuExists = (dinfo->stat != egpuNonexistent && dinfo->stat != egpuInsane);
 493
 494     if (!bGpuExists)
 495     {
 496         sprintf(s, "#%d: %s, stat: %s", dinfo->id, "N/A", gpu_detect_res_str[dinfo->stat]);
 497     }
 498     else
 499     {
 500         sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s", dinfo->id,
 501                 dinfo->prop.name, dinfo->prop.major, dinfo->prop.minor,
 502                 dinfo->prop.ECCEnabled ? "yes" : " no", gpu_detect_res_str[dinfo->stat]);
 503     }
 504 }
 505
 506 int get_current_cuda_gpu_device_id(void)
 507 {
 508     int gpuid;
 509     CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 510
 511     return gpuid;
 512 }
 513
 514 size_t sizeof_gpu_dev_info(void)
 515 {
 516     return sizeof(gmx_device_info_t);
 517 }
 518
 519 void startGpuProfiler(void)
 520 {
 521     /* The NVPROF_ID environment variable is set by nvprof and indicates that
 522        mdrun is executed in the CUDA profiler.
 523        If nvprof was run is with "--profile-from-start off", the profiler will
 524        be started here. This way we can avoid tracing the CUDA events from the
 525        first part of the run. Starting the profiler again does nothing.
 526      */
 527     if (cudaProfilerRun)
 528     {
 529         cudaError_t stat;
 530         stat = cudaProfilerStart();
 531         CU_RET_ERR(stat, "cudaProfilerStart failed");
 532     }
 533 }
 534
 535 void stopGpuProfiler(void)
 536 {
 537     /* Stopping the nvidia here allows us to eliminate the subsequent
 538        API calls from the trace, e.g. uninitialization and cleanup. */
 539     if (cudaProfilerRun)
 540     {
 541         cudaError_t stat;
 542         stat = cudaProfilerStop();
 543         CU_RET_ERR(stat, "cudaProfilerStop failed");
 544     }
 545 }
 546
 547 void resetGpuProfiler(void)
 548 {
 549     /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
 550      *  the profiling here (can't stop it) which will achieve the desired effect if
 551      *  the run was started with the profiling disabled.
 552      *
 553      * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.
 554      * stopGpuProfiler();
 555      */
 556     if (cudaProfilerRun)
 557     {
 558         startGpuProfiler();
 559     }
 560 }
 561
 562 int gpu_info_get_stat(const gmx_gpu_info_t& info, int index)
 563 {
 564     return info.gpu_dev[index].stat;
 565 }
 566
 567 /*! \brief Check status returned from peer access CUDA call, and error out or warn appropriately
 568  * \param[in] stat           CUDA call return status
 569  * \param[in] gpuA           ID for GPU initiating peer access call
 570  * \param[in] gpuB           ID for remote GPU
 571  * \param[in] mdlog          Logger object
 572  * \param[in] cudaCallName   name of CUDA peer access call
 573  */
 574 static void peerAccessCheckStat(const cudaError_t    stat,
 575                                 const int            gpuA,
 576                                 const int            gpuB,
 577                                 const gmx::MDLogger& mdlog,
 578                                 const char*          cudaCallName)
 579 {
 580     if ((stat == cudaErrorInvalidDevice) || (stat == cudaErrorInvalidValue))
 581     {
 582         std::string errorString =
 583                 gmx::formatString("%s from GPU %d to GPU %d failed", cudaCallName, gpuA, gpuB);
 584         CU_RET_ERR(stat, errorString.c_str());
 585     }
 586     if (stat != cudaSuccess)
 587     {
 588         GMX_LOG(mdlog.warning)
 589                 .asParagraph()
 590                 .appendTextFormatted(
 591                         "GPU peer access not enabled between GPUs %d and %d due to unexpected "
 592                         "return value from %s: %s",
 593                         gpuA, gpuB, cudaCallName, cudaGetErrorString(stat));
 594     }
 595 }
 596
 597 void setupGpuDevicePeerAccess(const std::vector<int>& gpuIdsToUse, const gmx::MDLogger& mdlog)
 598 {
 599     cudaError_t stat;
 600
 601     // take a note of currently-set GPU
 602     int currentGpu;
 603     stat = cudaGetDevice(&currentGpu);
 604     CU_RET_ERR(stat, "cudaGetDevice in setupGpuDevicePeerAccess failed");
 605
 606     std::string message = gmx::formatString(
 607             "Note: Peer access enabled between the following GPU pairs in the node:\n ");
 608     bool peerAccessEnabled = false;
 609
 610     for (unsigned int i = 0; i < gpuIdsToUse.size(); i++)
 611     {
 612         int gpuA = gpuIdsToUse[i];
 613         stat     = cudaSetDevice(gpuA);
 614         if (stat != cudaSuccess)
 615         {
 616             GMX_LOG(mdlog.warning)
 617                     .asParagraph()
 618                     .appendTextFormatted(
 619                             "GPU peer access not enabled due to unexpected return value from "
 620                             "cudaSetDevice(%d): %s",
 621                             gpuA, cudaGetErrorString(stat));
 622             return;
 623         }
 624         for (unsigned int j = 0; j < gpuIdsToUse.size(); j++)
 625         {
 626             if (j != i)
 627             {
 628                 int gpuB          = gpuIdsToUse[j];
 629                 int canAccessPeer = 0;
 630                 stat              = cudaDeviceCanAccessPeer(&canAccessPeer, gpuA, gpuB);
 631                 peerAccessCheckStat(stat, gpuA, gpuB, mdlog, "cudaDeviceCanAccessPeer");
 632
 633                 if (canAccessPeer)
 634                 {
 635                     stat = cudaDeviceEnablePeerAccess(gpuB, 0);
 636                     peerAccessCheckStat(stat, gpuA, gpuB, mdlog, "cudaDeviceEnablePeerAccess");
 637
 638                     message           = gmx::formatString("%s%d->%d ", message.c_str(), gpuA, gpuB);
 639                     peerAccessEnabled = true;
 640                 }
 641             }
 642         }
 643     }
 644
 645     // re-set GPU to that originally set
 646     stat = cudaSetDevice(currentGpu);
 647     if (stat != cudaSuccess)
 648     {
 649         CU_RET_ERR(stat, "cudaSetDevice in setupGpuDevicePeerAccess failed");
 650         return;
 651     }
 652
 653     if (peerAccessEnabled)
 654     {
 655         GMX_LOG(mdlog.info).asParagraph().appendTextFormatted("%s", message.c_str());
 656     }
 657 }