src/gromacs/gpu_utils/gpu_utils.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2010,2011,2012,2013,2014,2015,2016, The GROMACS development team.
   5  * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36 /*! \file
  37  *  \brief Define functions for detection and initialization for CUDA devices.
  38  *
  39  *  \author Szilard Pall <pall.szilard@gmail.com>
  40  */
  41
  42 #include "gmxpre.h"
  43
  44 #include "gpu_utils.h"
  45
  46 #include <assert.h>
  47 #include <stdio.h>
  48 #include <stdlib.h>
  49
  50 #include <cuda_profiler_api.h>
  51
  52 #include "gromacs/gpu_utils/cudautils.cuh"
  53 #include "gromacs/gpu_utils/device_context.h"
  54 #include "gromacs/gpu_utils/device_stream.h"
  55 #include "gromacs/gpu_utils/pmalloc_cuda.h"
  56 #include "gromacs/hardware/gpu_hw_info.h"
  57 #include "gromacs/utility/basedefinitions.h"
  58 #include "gromacs/utility/cstringutil.h"
  59 #include "gromacs/utility/exceptions.h"
  60 #include "gromacs/utility/fatalerror.h"
  61 #include "gromacs/utility/gmxassert.h"
  62 #include "gromacs/utility/logger.h"
  63 #include "gromacs/utility/programcontext.h"
  64 #include "gromacs/utility/smalloc.h"
  65 #include "gromacs/utility/snprintf.h"
  66 #include "gromacs/utility/stringutil.h"
  67
  68 /*! \internal \brief
  69  * Max number of devices supported by CUDA (for consistency checking).
  70  *
  71  * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
  72  */
  73 static int cuda_max_device_count = 32;
  74
  75 static bool cudaProfilerRun = ((getenv("NVPROF_ID") != nullptr));
  76
  77 /** Dummy kernel used for sanity checking. */
  78 static __global__ void k_dummy_test(void) {}
  79
  80 static cudaError_t checkCompiledTargetCompatibility(int deviceId, const cudaDeviceProp& deviceProp)
  81 {
  82     cudaFuncAttributes attributes;
  83     cudaError_t        stat = cudaFuncGetAttributes(&attributes, k_dummy_test);
  84
  85     if (cudaErrorInvalidDeviceFunction == stat)
  86     {
  87         fprintf(stderr,
  88                 "\nWARNING: The %s binary does not include support for the CUDA architecture of "
  89                 "the GPU ID #%d (compute capability %d.%d) detected during detection. "
  90                 "By default, GROMACS supports all architectures of compute "
  91                 "capability >= 3.0, so your GPU "
  92                 "might be rare, or some architectures were disabled in the build. \n"
  93                 "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
  94                 "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. \n",
  95                 gmx::getProgramContext().displayName(), deviceId, deviceProp.major, deviceProp.minor);
  96     }
  97
  98     return stat;
  99 }
 100
 101 bool isHostMemoryPinned(const void* h_ptr)
 102 {
 103     cudaPointerAttributes memoryAttributes;
 104     cudaError_t           stat = cudaPointerGetAttributes(&memoryAttributes, h_ptr);
 105
 106     bool result = false;
 107     switch (stat)
 108     {
 109         case cudaSuccess: result = true; break;
 110
 111         case cudaErrorInvalidValue:
 112             // If the buffer was not pinned, then it will not be recognized by CUDA at all
 113             result = false;
 114             // Reset the last error status
 115             cudaGetLastError();
 116             break;
 117
 118         default: CU_RET_ERR(stat, "Unexpected CUDA error");
 119     }
 120     return result;
 121 }
 122
 123 /*!
 124  * \brief Runs GPU sanity checks.
 125  *
 126  * Runs a series of checks to determine that the given GPU and underlying CUDA
 127  * driver/runtime functions properly.
 128  *
 129  * \todo Currently we do not make a distinction between the type of errors
 130  *       that can appear during functionality checks. This needs to be improved,
 131  *       e.g if the dummy test kernel fails to execute with a "device busy message"
 132  *       we should appropriately report that the device is busy instead of NonFunctional.
 133  *
 134  * \todo Introduce errors codes and handle errors more smoothly.
 135  *
 136  *
 137  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
 138  * \param[in]  dev_prop    The device properties structure
 139  * \returns                0 if the device looks OK, -1 if it sanity checks failed, and -2 if the device is busy
 140  */
 141 static DeviceStatus isDeviceFunctional(int dev_id, const cudaDeviceProp& dev_prop)
 142 {
 143     cudaError_t cu_err;
 144     int         dev_count, id;
 145
 146     cu_err = cudaGetDeviceCount(&dev_count);
 147     if (cu_err != cudaSuccess)
 148     {
 149         fprintf(stderr, "Error %d while querying device count: %s\n", cu_err, cudaGetErrorString(cu_err));
 150         return DeviceStatus::NonFunctional;
 151     }
 152
 153     /* no CUDA compatible device at all */
 154     if (dev_count == 0)
 155     {
 156         return DeviceStatus::NonFunctional;
 157     }
 158
 159     /* things might go horribly wrong if cudart is not compatible with the driver */
 160     if (dev_count < 0 || dev_count > cuda_max_device_count)
 161     {
 162         return DeviceStatus::NonFunctional;
 163     }
 164
 165     if (dev_id == -1) /* device already selected let's not destroy the context */
 166     {
 167         cu_err = cudaGetDevice(&id);
 168         if (cu_err != cudaSuccess)
 169         {
 170             fprintf(stderr, "Error %d while querying device id: %s\n", cu_err, cudaGetErrorString(cu_err));
 171             return DeviceStatus::NonFunctional;
 172         }
 173     }
 174     else
 175     {
 176         id = dev_id;
 177         if (id > dev_count - 1) /* pfff there's no such device */
 178         {
 179             fprintf(stderr,
 180                     "The requested device with id %d does not seem to exist (device count=%d)\n",
 181                     dev_id, dev_count);
 182             return DeviceStatus::NonFunctional;
 183         }
 184     }
 185
 186     /* both major & minor is 9999 if no CUDA capable devices are present */
 187     if (dev_prop.major == 9999 && dev_prop.minor == 9999)
 188     {
 189         return DeviceStatus::NonFunctional;
 190     }
 191     /* we don't care about emulation mode */
 192     if (dev_prop.major == 0)
 193     {
 194         return DeviceStatus::NonFunctional;
 195     }
 196
 197     if (id != -1)
 198     {
 199         cu_err = cudaSetDevice(id);
 200         if (cu_err != cudaSuccess)
 201         {
 202             fprintf(stderr, "Error %d while switching to device #%d: %s\n", cu_err, id,
 203                     cudaGetErrorString(cu_err));
 204             return DeviceStatus::NonFunctional;
 205         }
 206     }
 207
 208     cu_err = checkCompiledTargetCompatibility(dev_id, dev_prop);
 209     // Avoid triggering an error if GPU devices are in exclusive or prohibited mode;
 210     // it is enough to check for cudaErrorDevicesUnavailable only here because
 211     // if we encounter it that will happen in cudaFuncGetAttributes in the above function.
 212     if (cu_err == cudaErrorDevicesUnavailable)
 213     {
 214         return DeviceStatus::Unavailable;
 215     }
 216     else if (cu_err != cudaSuccess)
 217     {
 218         return DeviceStatus::NonFunctional;
 219     }
 220
 221     /* try to execute a dummy kernel */
 222     try
 223     {
 224         KernelLaunchConfig config;
 225         config.blockSize[0]                = 512;
 226         const auto          dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
 227         DeviceInformation   deviceInfo;
 228         const DeviceContext deviceContext(deviceInfo);
 229         const DeviceStream  deviceStream(deviceContext, DeviceStreamPriority::Normal, false);
 230         launchGpuKernel(k_dummy_test, config, deviceStream, nullptr, "Dummy kernel", dummyArguments);
 231     }
 232     catch (gmx::GromacsException& ex)
 233     {
 234         // launchGpuKernel error is not fatal and should continue with marking the device bad
 235         fprintf(stderr,
 236                 "Error occurred while running dummy kernel sanity check on device #%d:\n %s\n", id,
 237                 formatExceptionMessageToString(ex).c_str());
 238         return DeviceStatus::NonFunctional;
 239     }
 240
 241     if (cudaDeviceSynchronize() != cudaSuccess)
 242     {
 243         return DeviceStatus::NonFunctional;
 244     }
 245
 246     /* destroy context if we created one */
 247     if (id != -1)
 248     {
 249         cu_err = cudaDeviceReset();
 250         CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 251     }
 252
 253     return DeviceStatus::Compatible;
 254 }
 255
 256 void init_gpu(const DeviceInformation* deviceInfo)
 257 {
 258     cudaError_t stat;
 259
 260     assert(deviceInfo);
 261
 262     stat = cudaSetDevice(deviceInfo->id);
 263     if (stat != cudaSuccess)
 264     {
 265         auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id);
 266         CU_RET_ERR(stat, message.c_str());
 267     }
 268
 269     if (debug)
 270     {
 271         fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceInfo->id, deviceInfo->prop.name);
 272     }
 273 }
 274
 275 void free_gpu(const DeviceInformation* deviceInfo)
 276 {
 277     // One should only attempt to clear the device context when
 278     // it has been used, but currently the only way to know that a GPU
 279     // device was used is that deviceInfo will be non-null.
 280     if (deviceInfo == nullptr)
 281     {
 282         return;
 283     }
 284
 285     cudaError_t stat;
 286
 287     if (debug)
 288     {
 289         int gpuid;
 290         stat = cudaGetDevice(&gpuid);
 291         CU_RET_ERR(stat, "cudaGetDevice failed");
 292         fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 293     }
 294
 295     stat = cudaDeviceReset();
 296     if (stat != cudaSuccess)
 297     {
 298         gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat));
 299     }
 300 }
 301
 302 DeviceInformation* getDeviceInfo(const gmx_gpu_info_t& gpu_info, int deviceId)
 303 {
 304     if (deviceId < 0 || deviceId >= gpu_info.n_dev)
 305     {
 306         gmx_incons("Invalid GPU deviceId requested");
 307     }
 308     return &gpu_info.deviceInfo[deviceId];
 309 }
 310
 311 /*! \brief Returns true if the gpu characterized by the device properties is
 312  *  supported by the native gpu acceleration.
 313  *
 314  * \param[in] dev_prop  the CUDA device properties of the gpus to test.
 315  * \returns             true if the GPU properties passed indicate a compatible
 316  *                      GPU, otherwise false.
 317  */
 318 static bool is_gmx_supported_gpu(const cudaDeviceProp& dev_prop)
 319 {
 320     return (dev_prop.major >= 3);
 321 }
 322
 323 /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 324  *
 325  *  Returns a status value which indicates compatibility or one of the following
 326  *  errors: incompatibility or insanity (=unexpected behavior).
 327  *
 328  *  As the error handling only permits returning the state of the GPU, this function
 329  *  does not clear the CUDA runtime API status allowing the caller to inspect the error
 330  *  upon return. Note that this also means it is the caller's responsibility to
 331  *  reset the CUDA runtime state.
 332  *
 333  *  \param[in]  deviceId   the ID of the GPU to check.
 334  *  \param[in]  deviceProp the CUDA device properties of the device checked.
 335  *  \returns               the status of the requested device
 336  */
 337 static DeviceStatus checkDeviceStatus(int deviceId, const cudaDeviceProp& deviceProp)
 338 {
 339     if (!is_gmx_supported_gpu(deviceProp))
 340     {
 341         return DeviceStatus::Incompatible;
 342     }
 343     return isDeviceFunctional(deviceId, deviceProp);
 344 }
 345
 346 bool isGpuDetectionFunctional(std::string* errorMessage)
 347 {
 348     cudaError_t stat;
 349     int         driverVersion = -1;
 350     stat                      = cudaDriverGetVersion(&driverVersion);
 351     GMX_ASSERT(stat != cudaErrorInvalidValue,
 352                "An impossible null pointer was passed to cudaDriverGetVersion");
 353     GMX_RELEASE_ASSERT(
 354             stat == cudaSuccess,
 355             gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
 356                               cudaGetErrorName(stat), cudaGetErrorString(stat))
 357                     .c_str());
 358     bool foundDriver = (driverVersion > 0);
 359     if (!foundDriver)
 360     {
 361         // Can't detect GPUs if there is no driver
 362         if (errorMessage != nullptr)
 363         {
 364             errorMessage->assign("No valid CUDA driver found");
 365         }
 366         return false;
 367     }
 368
 369     int numDevices;
 370     stat = cudaGetDeviceCount(&numDevices);
 371     if (stat != cudaSuccess)
 372     {
 373         if (errorMessage != nullptr)
 374         {
 375             /* cudaGetDeviceCount failed which means that there is
 376              * something wrong with the machine: driver-runtime
 377              * mismatch, all GPUs being busy in exclusive mode,
 378              * invalid CUDA_VISIBLE_DEVICES, or some other condition
 379              * which should result in GROMACS issuing at least a
 380              * warning. */
 381             errorMessage->assign(cudaGetErrorString(stat));
 382         }
 383
 384         // Consume the error now that we have prepared to handle
 385         // it. This stops it reappearing next time we check for
 386         // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
 387         // valid devices, then cudaGetLastError returns the
 388         // (undocumented) cudaErrorNoDevice, but this should not be a
 389         // problem as there should be no future CUDA API calls.
 390         // NVIDIA bug report #2038718 has been filed.
 391         cudaGetLastError();
 392         // Can't detect GPUs
 393         return false;
 394     }
 395
 396     // We don't actually use numDevices here, that's not the job of
 397     // this function.
 398     return true;
 399 }
 400
 401 void findGpus(gmx_gpu_info_t* gpu_info)
 402 {
 403     assert(gpu_info);
 404
 405     gpu_info->n_dev_compatible = 0;
 406
 407     int         ndev;
 408     cudaError_t stat = cudaGetDeviceCount(&ndev);
 409     if (stat != cudaSuccess)
 410     {
 411         GMX_THROW(gmx::InternalError(
 412                 "Invalid call of findGpus() when CUDA API returned an error, perhaps "
 413                 "canDetectGpus() was not called appropriately beforehand."));
 414     }
 415
 416     // We expect to start device support/sanity checks with a clean runtime error state
 417     gmx::ensureNoPendingCudaError("");
 418
 419     DeviceInformation* devs;
 420     snew(devs, ndev);
 421     for (int i = 0; i < ndev; i++)
 422     {
 423         cudaDeviceProp prop;
 424         memset(&prop, 0, sizeof(cudaDeviceProp));
 425         stat = cudaGetDeviceProperties(&prop, i);
 426         const DeviceStatus checkResult =
 427                 (stat != cudaSuccess) ? DeviceStatus::NonFunctional : checkDeviceStatus(i, prop);
 428
 429         devs[i].id   = i;
 430         devs[i].prop = prop;
 431         devs[i].stat = checkResult;
 432
 433         if (checkResult == DeviceStatus::Compatible)
 434         {
 435             gpu_info->n_dev_compatible++;
 436         }
 437         else
 438         {
 439             // TODO:
 440             //  - we inspect the CUDA API state to retrieve and record any
 441             //    errors that occurred during is_gmx_supported_gpu_id() here,
 442             //    but this would be more elegant done within is_gmx_supported_gpu_id()
 443             //    and only return a string with the error if one was encountered.
 444             //  - we'll be reporting without rank information which is not ideal.
 445             //  - we'll end up warning also in cases where users would already
 446             //    get an error before mdrun aborts.
 447             //
 448             // Here we also clear the CUDA API error state so potential
 449             // errors during sanity checks don't propagate.
 450             if ((stat = cudaGetLastError()) != cudaSuccess)
 451             {
 452                 gmx_warning("An error occurred while sanity checking device #%d; %s: %s",
 453                             devs[i].id, cudaGetErrorName(stat), cudaGetErrorString(stat));
 454             }
 455         }
 456     }
 457
 458     stat = cudaPeekAtLastError();
 459     GMX_RELEASE_ASSERT(stat == cudaSuccess,
 460                        gmx::formatString("We promise to return with clean CUDA state, but "
 461                                          "non-success state encountered: %s: %s",
 462                                          cudaGetErrorName(stat), cudaGetErrorString(stat))
 463                                .c_str());
 464
 465     gpu_info->n_dev      = ndev;
 466     gpu_info->deviceInfo = devs;
 467 }
 468
 469 void get_gpu_device_info_string(char* s, const gmx_gpu_info_t& gpu_info, int index)
 470 {
 471     assert(s);
 472
 473     if (index < 0 && index >= gpu_info.n_dev)
 474     {
 475         return;
 476     }
 477
 478     DeviceInformation* dinfo = &gpu_info.deviceInfo[index];
 479
 480     bool bGpuExists =
 481             (dinfo->stat != DeviceStatus::Nonexistent && dinfo->stat != DeviceStatus::NonFunctional);
 482
 483     if (!bGpuExists)
 484     {
 485         sprintf(s, "#%d: %s, stat: %s", dinfo->id, "N/A", c_deviceStateString[dinfo->stat]);
 486     }
 487     else
 488     {
 489         sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s", dinfo->id,
 490                 dinfo->prop.name, dinfo->prop.major, dinfo->prop.minor,
 491                 dinfo->prop.ECCEnabled ? "yes" : " no", c_deviceStateString[dinfo->stat]);
 492     }
 493 }
 494
 495 int get_current_cuda_gpu_device_id(void)
 496 {
 497     int gpuid;
 498     CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 499
 500     return gpuid;
 501 }
 502
 503 size_t sizeof_gpu_dev_info(void)
 504 {
 505     return sizeof(DeviceInformation);
 506 }
 507
 508 void startGpuProfiler(void)
 509 {
 510     /* The NVPROF_ID environment variable is set by nvprof and indicates that
 511        mdrun is executed in the CUDA profiler.
 512        If nvprof was run is with "--profile-from-start off", the profiler will
 513        be started here. This way we can avoid tracing the CUDA events from the
 514        first part of the run. Starting the profiler again does nothing.
 515      */
 516     if (cudaProfilerRun)
 517     {
 518         cudaError_t stat;
 519         stat = cudaProfilerStart();
 520         CU_RET_ERR(stat, "cudaProfilerStart failed");
 521     }
 522 }
 523
 524 void stopGpuProfiler(void)
 525 {
 526     /* Stopping the nvidia here allows us to eliminate the subsequent
 527        API calls from the trace, e.g. uninitialization and cleanup. */
 528     if (cudaProfilerRun)
 529     {
 530         cudaError_t stat;
 531         stat = cudaProfilerStop();
 532         CU_RET_ERR(stat, "cudaProfilerStop failed");
 533     }
 534 }
 535
 536 void resetGpuProfiler(void)
 537 {
 538     /* With CUDA <=7.5 the profiler can't be properly reset; we can only start
 539      *  the profiling here (can't stop it) which will achieve the desired effect if
 540      *  the run was started with the profiling disabled.
 541      *
 542      * TODO: add a stop (or replace it with reset) when this will work correctly in CUDA.
 543      * stopGpuProfiler();
 544      */
 545     if (cudaProfilerRun)
 546     {
 547         startGpuProfiler();
 548     }
 549 }
 550
 551 DeviceStatus gpu_info_get_stat(const gmx_gpu_info_t& info, int index)
 552 {
 553     return info.deviceInfo[index].stat;
 554 }
 555
 556 /*! \brief Check status returned from peer access CUDA call, and error out or warn appropriately
 557  * \param[in] stat           CUDA call return status
 558  * \param[in] gpuA           ID for GPU initiating peer access call
 559  * \param[in] gpuB           ID for remote GPU
 560  * \param[in] mdlog          Logger object
 561  * \param[in] cudaCallName   name of CUDA peer access call
 562  */
 563 static void peerAccessCheckStat(const cudaError_t    stat,
 564                                 const int            gpuA,
 565                                 const int            gpuB,
 566                                 const gmx::MDLogger& mdlog,
 567                                 const char*          cudaCallName)
 568 {
 569     if ((stat == cudaErrorInvalidDevice) || (stat == cudaErrorInvalidValue))
 570     {
 571         std::string errorString =
 572                 gmx::formatString("%s from GPU %d to GPU %d failed", cudaCallName, gpuA, gpuB);
 573         CU_RET_ERR(stat, errorString.c_str());
 574     }
 575     if (stat != cudaSuccess)
 576     {
 577         GMX_LOG(mdlog.warning)
 578                 .asParagraph()
 579                 .appendTextFormatted(
 580                         "GPU peer access not enabled between GPUs %d and %d due to unexpected "
 581                         "return value from %s: %s",
 582                         gpuA, gpuB, cudaCallName, cudaGetErrorString(stat));
 583     }
 584 }
 585
 586 void setupGpuDevicePeerAccess(const std::vector<int>& gpuIdsToUse, const gmx::MDLogger& mdlog)
 587 {
 588     cudaError_t stat;
 589
 590     // take a note of currently-set GPU
 591     int currentGpu;
 592     stat = cudaGetDevice(&currentGpu);
 593     CU_RET_ERR(stat, "cudaGetDevice in setupGpuDevicePeerAccess failed");
 594
 595     std::string message = gmx::formatString(
 596             "Note: Peer access enabled between the following GPU pairs in the node:\n ");
 597     bool peerAccessEnabled = false;
 598
 599     for (unsigned int i = 0; i < gpuIdsToUse.size(); i++)
 600     {
 601         int gpuA = gpuIdsToUse[i];
 602         stat     = cudaSetDevice(gpuA);
 603         if (stat != cudaSuccess)
 604         {
 605             GMX_LOG(mdlog.warning)
 606                     .asParagraph()
 607                     .appendTextFormatted(
 608                             "GPU peer access not enabled due to unexpected return value from "
 609                             "cudaSetDevice(%d): %s",
 610                             gpuA, cudaGetErrorString(stat));
 611             return;
 612         }
 613         for (unsigned int j = 0; j < gpuIdsToUse.size(); j++)
 614         {
 615             if (j != i)
 616             {
 617                 int gpuB          = gpuIdsToUse[j];
 618                 int canAccessPeer = 0;
 619                 stat              = cudaDeviceCanAccessPeer(&canAccessPeer, gpuA, gpuB);
 620                 peerAccessCheckStat(stat, gpuA, gpuB, mdlog, "cudaDeviceCanAccessPeer");
 621
 622                 if (canAccessPeer)
 623                 {
 624                     stat = cudaDeviceEnablePeerAccess(gpuB, 0);
 625                     peerAccessCheckStat(stat, gpuA, gpuB, mdlog, "cudaDeviceEnablePeerAccess");
 626
 627                     message           = gmx::formatString("%s%d->%d ", message.c_str(), gpuA, gpuB);
 628                     peerAccessEnabled = true;
 629                 }
 630             }
 631         }
 632     }
 633
 634     // re-set GPU to that originally set
 635     stat = cudaSetDevice(currentGpu);
 636     if (stat != cudaSuccess)
 637     {
 638         CU_RET_ERR(stat, "cudaSetDevice in setupGpuDevicePeerAccess failed");
 639         return;
 640     }
 641
 642     if (peerAccessEnabled)
 643     {
 644         GMX_LOG(mdlog.info).asParagraph().appendTextFormatted("%s", message.c_str());
 645     }
 646 }