src/gromacs/gpu_utils/gpu_utils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2010,2011,2012,2013,2014,2015,2016, The GROMACS development team.
   5  * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36 /*! \file
  37  *  \brief Define functions for detection and initialization for CUDA devices.
  38  *
  39  *  \author Szilard Pall <pall.szilard@gmail.com>
  40  */
  41
  42 #include "gmxpre.h"
  43
  44 #include "gpu_utils.h"
  45
  46 #include <assert.h>
  47 #include <stdio.h>
  48 #include <stdlib.h>
  49
  50 #include <cuda_profiler_api.h>
  51
  52 #include "gromacs/gpu_utils/cudautils.cuh"
  53 #include "gromacs/gpu_utils/device_context.h"
  54 #include "gromacs/gpu_utils/device_stream.h"
  55 #include "gromacs/gpu_utils/pmalloc_cuda.h"
  56 #include "gromacs/hardware/gpu_hw_info.h"
  57 #include "gromacs/utility/basedefinitions.h"
  58 #include "gromacs/utility/cstringutil.h"
  59 #include "gromacs/utility/exceptions.h"
  60 #include "gromacs/utility/fatalerror.h"
  61 #include "gromacs/utility/gmxassert.h"
  62 #include "gromacs/utility/logger.h"
  63 #include "gromacs/utility/programcontext.h"
  64 #include "gromacs/utility/smalloc.h"
  65 #include "gromacs/utility/snprintf.h"
  66 #include "gromacs/utility/stringutil.h"
  67
  68 /*! \internal \brief
  69  * Max number of devices supported by CUDA (for consistency checking).
  70  *
  71  * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
  72  */
  73 static int cuda_max_device_count = 32;
  74
  75 static bool cudaProfilerRun = ((getenv("NVPROF_ID") != nullptr));
  76
  77 /** Dummy kernel used for sanity checking. */
  78 static __global__ void k_dummy_test(void) {}
  79
  80 static cudaError_t checkCompiledTargetCompatibility(int deviceId, const cudaDeviceProp& deviceProp)
  81 {
  82     cudaFuncAttributes attributes;
  83     cudaError_t        stat = cudaFuncGetAttributes(&attributes, k_dummy_test);
  84
  85     if (cudaErrorInvalidDeviceFunction == stat)
  86     {
  87         fprintf(stderr,
  88                 "\nWARNING: The %s binary does not include support for the CUDA architecture of "
  89                 "the GPU ID #%d (compute capability %d.%d) detected during detection. "
  90                 "By default, GROMACS supports all architectures of compute "
  91                 "capability >= 3.0, so your GPU "
  92                 "might be rare, or some architectures were disabled in the build. \n"
  93                 "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
  94                 "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. \n",
  95                 gmx::getProgramContext().displayName(), deviceId, deviceProp.major, deviceProp.minor);
  96     }
  97
  98     return stat;
  99 }
 100
 101 bool isHostMemoryPinned(const void* h_ptr)
 102 {
 103     cudaPointerAttributes memoryAttributes;
 104     cudaError_t           stat = cudaPointerGetAttributes(&memoryAttributes, h_ptr);
 105
 106     bool result = false;
 107     switch (stat)
 108     {
 109         case cudaSuccess: result = true; break;
 110
 111         case cudaErrorInvalidValue:
 112             // If the buffer was not pinned, then it will not be recognized by CUDA at all
 113             result = false;
 114             // Reset the last error status
 115             cudaGetLastError();
 116             break;
 117
 118         default: CU_RET_ERR(stat, "Unexpected CUDA error");
 119     }
 120     return result;
 121 }
 122
 123 /*!
 124  * \brief Runs GPU sanity checks.
 125  *
 126  * Runs a series of checks to determine that the given GPU and underlying CUDA
 127  * driver/runtime functions properly.
 128  *
 129  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
 130  * \param[in]  dev_prop    The device properties structure
 131  * \returns                0 if the device looks OK, -1 if it sanity checks failed, and -2 if the device is busy
 132  *
 133  * TODO: introduce errors codes and handle errors more smoothly.
 134  */
 135 static int do_sanity_checks(int dev_id, const cudaDeviceProp& dev_prop)
 136 {
 137     cudaError_t cu_err;
 138     int         dev_count, id;
 139
 140     cu_err = cudaGetDeviceCount(&dev_count);
 141     if (cu_err != cudaSuccess)
 142     {
 143         fprintf(stderr, "Error %d while querying device count: %s\n", cu_err, cudaGetErrorString(cu_err));
 144         return -1;
 145     }
 146
 147     /* no CUDA compatible device at all */
 148     if (dev_count == 0)
 149     {
 150         return -1;
 151     }
 152
 153     /* things might go horribly wrong if cudart is not compatible with the driver */
 154     if (dev_count < 0 || dev_count > cuda_max_device_count)
 155     {
 156         return -1;
 157     }
 158
 159     if (dev_id == -1) /* device already selected let's not destroy the context */
 160     {
 161         cu_err = cudaGetDevice(&id);
 162         if (cu_err != cudaSuccess)
 163         {
 164             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err, cudaGetErrorString(cu_err));
 165             return -1;
 166         }
 167     }
 168     else
 169     {
 170         id = dev_id;
 171         if (id > dev_count - 1) /* pfff there's no such device */
 172         {
 173             fprintf(stderr,
 174                     "The requested device with id %d does not seem to exist (device count=%d)\n",
 175                     dev_id, dev_count);
 176             return -1;
 177         }
 178     }
 179
 180     /* both major & minor is 9999 if no CUDA capable devices are present */
 181     if (dev_prop.major == 9999 && dev_prop.minor == 9999)
 182     {
 183         return -1;
 184     }
 185     /* we don't care about emulation mode */
 186     if (dev_prop.major == 0)
 187     {
 188         return -1;
 189     }
 190
 191     if (id != -1)
 192     {
 193         cu_err = cudaSetDevice(id);
 194         if (cu_err != cudaSuccess)
 195         {
 196             fprintf(stderr, "Error %d while switching to device #%d: %s\n", cu_err, id,
 197                     cudaGetErrorString(cu_err));
 198             return -1;
 199         }
 200     }
 201
 202     cu_err = checkCompiledTargetCompatibility(dev_id, dev_prop);
 203     // Avoid triggering an error if GPU devices are in exclusive or prohibited mode;
 204     // it is enough to check for cudaErrorDevicesUnavailable only here because
 205     // if we encounter it that will happen in cudaFuncGetAttributes in the above function.
 206     if (cu_err == cudaErrorDevicesUnavailable)
 207     {
 208         return -2;
 209     }
 210     else if (cu_err != cudaSuccess)
 211     {
 212         return -1;
 213     }
 214
 215     /* try to execute a dummy kernel */
 216     try
 217     {
 218         KernelLaunchConfig config;
 219         config.blockSize[0]                = 512;
 220         const auto          dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
 221         DeviceInformation   deviceInfo;
 222         const DeviceContext deviceContext(deviceInfo);
 223         const DeviceStream  deviceStream(deviceContext, DeviceStreamPriority::Normal, false);
 224         launchGpuKernel(k_dummy_test, config, deviceStream, nullptr, "Dummy kernel", dummyArguments);
 225     }
 226     catch (gmx::GromacsException& ex)
 227     {
 228         // launchGpuKernel error is not fatal and should continue with marking the device bad
 229         fprintf(stderr,
 230                 "Error occurred while running dummy kernel sanity check on device #%d:\n %s\n", id,
 231                 formatExceptionMessageToString(ex).c_str());
 232         return -1;
 233     }
 234
 235     if (cudaDeviceSynchronize() != cudaSuccess)
 236     {
 237         return -1;
 238     }
 239
 240     /* destroy context if we created one */
 241     if (id != -1)
 242     {
 243         cu_err = cudaDeviceReset();
 244         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 245     }
 246
 247     return 0;
 248 }
 249
 250 void init_gpu(const DeviceInformation* deviceInfo)
 251 {
 252     cudaError_t stat;
 253
 254     assert(deviceInfo);
 255
 256     stat = cudaSetDevice(deviceInfo->id);
 257     if (stat != cudaSuccess)
 258     {
 259         auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id);
 260         CU_RET_ERR(stat, message.c_str());
 261     }
 262
 263     if (debug)
 264     {
 265         fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceInfo->id, deviceInfo->prop.name);
 266     }
 267 }
 268
 269 void free_gpu(const DeviceInformation* deviceInfo)
 270 {
 271     // One should only attempt to clear the device context when
 272     // it has been used, but currently the only way to know that a GPU
 273     // device was used is that deviceInfo will be non-null.
 274     if (deviceInfo == nullptr)
 275     {
 276         return;
 277     }
 278
 279     cudaError_t stat;
 280
 281     if (debug)
 282     {
 283         int gpuid;
 284         stat = cudaGetDevice(&gpuid);
 285         CU_RET_ERR(stat, "cudaGetDevice failed");
 286         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 287     }
 288
 289     stat = cudaDeviceReset();
 290     if (stat != cudaSuccess)
 291     {
 292         gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat));
 293     }
 294 }
 295
 296 DeviceInformation* getDeviceInfo(const gmx_gpu_info_t& gpu_info, int deviceId)
 297 {
 298     if (deviceId < 0 || deviceId >= gpu_info.n_dev)
 299     {
 300         gmx_incons("Invalid GPU deviceId requested");
 301     }
 302     return &gpu_info.deviceInfo[deviceId];
 303 }
 304
 305 /*! \brief Returns true if the gpu characterized by the device properties is
 306  *  supported by the native gpu acceleration.
 307  *
 308  * \param[in] dev_prop  the CUDA device properties of the gpus to test.
 309  * \returns             true if the GPU properties passed indicate a compatible
 310  *                      GPU, otherwise false.
 311  */
 312 static bool is_gmx_supported_gpu(const cudaDeviceProp& dev_prop)
 313 {
 314     return (dev_prop.major >= 3);
 315 }
 316
 317 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 318  *
 319  *  Returns a status value which indicates compatibility or one of the following
 320  *  errors: incompatibility or insanity (=unexpected behavior).
 321  *
 322  *  As the error handling only permits returning the state of the GPU, this function
 323  *  does not clear the CUDA runtime API status allowing the caller to inspect the error
 324  *  upon return. Note that this also means it is the caller's responsibility to
 325  *  reset the CUDA runtime state.
 326  *
 327  *  \param[in]  deviceId   the ID of the GPU to check.
 328  *  \param[in]  deviceProp the CUDA device properties of the device checked.
 329  *  \returns               the status of the requested device
 330  */
 331 static int is_gmx_supported_gpu_id(int deviceId, const cudaDeviceProp& deviceProp)
 332 {
 333     if (!is_gmx_supported_gpu(deviceProp))
 334     {
 335         return egpuIncompatible;
 336     }
 337
 338     /* TODO: currently we do not make a distinction between the type of errors
 339      * that can appear during sanity checks. This needs to be improved, e.g if
 340      * the dummy test kernel fails to execute with a "device busy message" we
 341      * should appropriately report that the device is busy instead of insane.
 342      */
 343     const int checkResult = do_sanity_checks(deviceId, deviceProp);
 344     switch (checkResult)
 345     {
 346         case 0: return egpuCompatible;
 347         case -1: return egpuInsane;
 348         case -2: return egpuUnavailable;
 349         default:
 350             GMX_RELEASE_ASSERT(false, "Invalid do_sanity_checks() return value");
 351             return egpuCompatible;
 352     }
 353 }
 354
 355 bool isGpuDetectionFunctional(std::string* errorMessage)
 356 {
 357     cudaError_t stat;
 358     int         driverVersion = -1;
 359     stat                      = cudaDriverGetVersion(&driverVersion);
 360     GMX_ASSERT(stat != cudaErrorInvalidValue,
 361                "An impossible null pointer was passed to cudaDriverGetVersion");
 362     GMX_RELEASE_ASSERT(
 363             stat == cudaSuccess,
 364             gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
 365                               cudaGetErrorName(stat), cudaGetErrorString(stat))
 366                     .c_str());
 367     bool foundDriver = (driverVersion > 0);
 368     if (!foundDriver)
 369     {
 370         // Can't detect GPUs if there is no driver
 371         if (errorMessage != nullptr)
 372         {
 373             errorMessage->assign("No valid CUDA driver found");
 374         }
 375         return false;
 376     }
 377
 378     int numDevices;
 379     stat = cudaGetDeviceCount(&numDevices);
 380     if (stat != cudaSuccess)
 381     {
 382         if (errorMessage != nullptr)
 383         {
 384             /* cudaGetDeviceCount failed which means that there is
 385              * something wrong with the machine: driver-runtime
 386              * mismatch, all GPUs being busy in exclusive mode,
 387              * invalid CUDA_VISIBLE_DEVICES, or some other condition
 388              * which should result in GROMACS issuing at least a
 389              * warning. */
 390             errorMessage->assign(cudaGetErrorString(stat));
 391         }
 392
 393         // Consume the error now that we have prepared to handle
 394         // it. This stops it reappearing next time we check for
 395         // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
 396         // valid devices, then cudaGetLastError returns the
 397         // (undocumented) cudaErrorNoDevice, but this should not be a
 398         // problem as there should be no future CUDA API calls.
 399         // NVIDIA bug report #2038718 has been filed.
 400         cudaGetLastError();
 401         // Can't detect GPUs
 402         return false;
 403     }
 404
 405     // We don't actually use numDevices here, that's not the job of
 406     // this function.
 407     return true;
 408 }
 409
 410 void findGpus(gmx_gpu_info_t* gpu_info)
 411 {
 412     assert(gpu_info);
 413
 414     gpu_info->n_dev_compatible = 0;
 415
 416     int         ndev;
 417     cudaError_t stat = cudaGetDeviceCount(&ndev);
 418     if (stat != cudaSuccess)
 419     {
 420         GMX_THROW(gmx::InternalError(
 421                 "Invalid call of findGpus() when CUDA API returned an error, perhaps "
 422                 "canDetectGpus() was not called appropriately beforehand."));
 423     }
 424
 425     // We expect to start device support/sanity checks with a clean runtime error state
 426     gmx::ensureNoPendingCudaError("");
 427
 428     DeviceInformation* devs;
 429     snew(devs, ndev);
 430     for (int i = 0; i < ndev; i++)
 431     {
 432         cudaDeviceProp prop;
 433         memset(&prop, 0, sizeof(cudaDeviceProp));
 434         stat = cudaGetDeviceProperties(&prop, i);
 435         int checkResult;
 436         if (stat != cudaSuccess)
 437         {
 438             // Will handle the error reporting below
 439             checkResult = egpuInsane;
 440         }
 441         else
 442         {
 443             checkResult = is_gmx_supported_gpu_id(i, prop);
 444         }
 445
 446         devs[i].id   = i;
 447         devs[i].prop = prop;
 448         devs[i].stat = checkResult;
 449
 450         if (checkResult == egpuCompatible)
 451         {
 452             gpu_info->n_dev_compatible++;
 453         }
 454         else
 455         {
 456             // TODO:
 457             //  - we inspect the CUDA API state to retrieve and record any
 458             //    errors that occurred during is_gmx_supported_gpu_id() here,
 459             //    but this would be more elegant done within is_gmx_supported_gpu_id()
 460             //    and only return a string with the error if one was encountered.
 461             //  - we'll be reporting without rank information which is not ideal.
 462             //  - we'll end up warning also in cases where users would already
 463             //    get an error before mdrun aborts.
 464             //
 465             // Here we also clear the CUDA API error state so potential
 466             // errors during sanity checks don't propagate.
 467             if ((stat = cudaGetLastError()) != cudaSuccess)
 468             {
 469                 gmx_warning("An error occurred while sanity checking device #%d; %s: %s",
 470                             devs[i].id, cudaGetErrorName(stat), cudaGetErrorString(stat));
 471             }
 472         }
 473     }
 474
 475     stat = cudaPeekAtLastError();
 476     GMX_RELEASE_ASSERT(stat == cudaSuccess,
 477                        gmx::formatString("We promise to return with clean CUDA state, but "
 478                                          "non-success state encountered: %s: %s",
 479                                          cudaGetErrorName(stat), cudaGetErrorString(stat))
 480                                .c_str());
 481
 482     gpu_info->n_dev      = ndev;
 483     gpu_info->deviceInfo = devs;
 484 }
 485
 486 void get_gpu_device_info_string(char* s, const gmx_gpu_info_t& gpu_info, int index)
 487 {
 488     assert(s);
 489
 490     if (index < 0 && index >= gpu_info.n_dev)
 491     {
 492         return;
 493     }
 494
 495     DeviceInformation* dinfo = &gpu_info.deviceInfo[index];
 496
 497     bool bGpuExists = (dinfo->stat != egpuNonexistent && dinfo->stat != egpuInsane);
 498
 499     if (!bGpuExists)
 500     {
 501         sprintf(s, "#%d: %s, stat: %s", dinfo->id, "N/A", gpu_detect_res_str[dinfo->stat]);
 502     }
 503     else
 504     {
 505         sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s", dinfo->id,
 506                 dinfo->prop.name, dinfo->prop.major, dinfo->prop.minor,
 507                 dinfo->prop.ECCEnabled ? "yes" : " no", gpu_detect_res_str[dinfo->stat]);
 508     }
 509 }
 510
 511 int get_current_cuda_gpu_device_id(void)
 512 {
 513     int gpuid;
 514     CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 515
 516     return gpuid;
 517 }
 518
 519 size_t sizeof_gpu_dev_info(void)
 520 {
 521     return sizeof(DeviceInformation);
 522 }
 523
 524 void startGpuProfiler(void)
 525 {
 526     /* The NVPROF_ID environment variable is set by nvprof and indicates that
 527        mdrun is executed in the CUDA profiler.
 528        If nvprof was run is with "--profile-from-start off", the profiler will
 529        be started here. This way we can avoid tracing the CUDA events from the
 530        first part of the run. Starting the profiler again does nothing.
 531      */
 532     if (cudaProfilerRun)
 533     {
 534         cudaError_t stat;
 535         stat = cudaProfilerStart();
 536         CU_RET_ERR(stat, "cudaProfilerStart failed");
 537     }
 538 }
 539
 540 void stopGpuProfiler(void)
 541 {
 542     /* Stopping the nvidia here allows us to eliminate the subsequent
 543        API calls from the trace, e.g. uninitialization and cleanup. */
 544     if (cudaProfilerRun)
 545     {
 546         cudaError_t stat;
 547         stat = cudaProfilerStop();
 548         CU_RET_ERR(stat, "cudaProfilerStop failed");
 549     }
 550 }
 551
 552 void resetGpuProfiler(void)
 553 {
 554     /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
 555      *  the profiling here (can't stop it) which will achieve the desired effect if
 556      *  the run was started with the profiling disabled.
 557      *
 558      * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.
 559      * stopGpuProfiler();
 560      */
 561     if (cudaProfilerRun)
 562     {
 563         startGpuProfiler();
 564     }
 565 }
 566
 567 int gpu_info_get_stat(const gmx_gpu_info_t& info, int index)
 568 {
 569     return info.deviceInfo[index].stat;
 570 }
 571
 572 /*! \brief Check status returned from peer access CUDA call, and error out or warn appropriately
 573  * \param[in] stat           CUDA call return status
 574  * \param[in] gpuA           ID for GPU initiating peer access call
 575  * \param[in] gpuB           ID for remote GPU
 576  * \param[in] mdlog          Logger object
 577  * \param[in] cudaCallName   name of CUDA peer access call
 578  */
 579 static void peerAccessCheckStat(const cudaError_t    stat,
 580                                 const int            gpuA,
 581                                 const int            gpuB,
 582                                 const gmx::MDLogger& mdlog,
 583                                 const char*          cudaCallName)
 584 {
 585     if ((stat == cudaErrorInvalidDevice) || (stat == cudaErrorInvalidValue))
 586     {
 587         std::string errorString =
 588                 gmx::formatString("%s from GPU %d to GPU %d failed", cudaCallName, gpuA, gpuB);
 589         CU_RET_ERR(stat, errorString.c_str());
 590     }
 591     if (stat != cudaSuccess)
 592     {
 593         GMX_LOG(mdlog.warning)
 594                 .asParagraph()
 595                 .appendTextFormatted(
 596                         "GPU peer access not enabled between GPUs %d and %d due to unexpected "
 597                         "return value from %s: %s",
 598                         gpuA, gpuB, cudaCallName, cudaGetErrorString(stat));
 599     }
 600 }
 601
 602 void setupGpuDevicePeerAccess(const std::vector<int>& gpuIdsToUse, const gmx::MDLogger& mdlog)
 603 {
 604     cudaError_t stat;
 605
 606     // take a note of currently-set GPU
 607     int currentGpu;
 608     stat = cudaGetDevice(&currentGpu);
 609     CU_RET_ERR(stat, "cudaGetDevice in setupGpuDevicePeerAccess failed");
 610
 611     std::string message = gmx::formatString(
 612             "Note: Peer access enabled between the following GPU pairs in the node:\n ");
 613     bool peerAccessEnabled = false;
 614
 615     for (unsigned int i = 0; i < gpuIdsToUse.size(); i++)
 616     {
 617         int gpuA = gpuIdsToUse[i];
 618         stat     = cudaSetDevice(gpuA);
 619         if (stat != cudaSuccess)
 620         {
 621             GMX_LOG(mdlog.warning)
 622                     .asParagraph()
 623                     .appendTextFormatted(
 624                             "GPU peer access not enabled due to unexpected return value from "
 625                             "cudaSetDevice(%d): %s",
 626                             gpuA, cudaGetErrorString(stat));
 627             return;
 628         }
 629         for (unsigned int j = 0; j < gpuIdsToUse.size(); j++)
 630         {
 631             if (j != i)
 632             {
 633                 int gpuB          = gpuIdsToUse[j];
 634                 int canAccessPeer = 0;
 635                 stat              = cudaDeviceCanAccessPeer(&canAccessPeer, gpuA, gpuB);
 636                 peerAccessCheckStat(stat, gpuA, gpuB, mdlog, "cudaDeviceCanAccessPeer");
 637
 638                 if (canAccessPeer)
 639                 {
 640                     stat = cudaDeviceEnablePeerAccess(gpuB, 0);
 641                     peerAccessCheckStat(stat, gpuA, gpuB, mdlog, "cudaDeviceEnablePeerAccess");
 642
 643                     message           = gmx::formatString("%s%d->%d ", message.c_str(), gpuA, gpuB);
 644                     peerAccessEnabled = true;
 645                 }
 646             }
 647         }
 648     }
 649
 650     // re-set GPU to that originally set
 651     stat = cudaSetDevice(currentGpu);
 652     if (stat != cudaSuccess)
 653     {
 654         CU_RET_ERR(stat, "cudaSetDevice in setupGpuDevicePeerAccess failed");
 655         return;
 656     }
 657
 658     if (peerAccessEnabled)
 659     {
 660         GMX_LOG(mdlog.info).asParagraph().appendTextFormatted("%s", message.c_str());
 661     }
 662 }