Merge branch release-2019 into release-2020

[alexxy/gromacs.git] / src / gromacs / gpu_utils / gpu_utils.cu
diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu

index 41aedf9f13e9dc6e8a1153e9fa532785077a8747..216a930b67243c194feebecaf8948052d73ed397 100644 (file)
--- a/src/gromacs/gpu_utils/gpu_utils.cu
+++ b/src/gromacs/gpu_utils/gpu_utils.cu
@@ -1,7 +1,8 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2010,2011,2012,2013,2014,2015,2016, The GROMACS development team.
+ * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -56,6 +57,7 @@
  #include "gromacs/utility/exceptions.h"
  #include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/logger.h"
  #include "gromacs/utility/programcontext.h"
  #include "gromacs/utility/smalloc.h"
  #include "gromacs/utility/snprintf.h"
@@ -66,17 +68,14 @@
   *
   * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
   */
-static int  cuda_max_device_count = 32;
+static int cuda_max_device_count = 32;
  
-static bool cudaProfilerRun      = ((getenv("NVPROF_ID") != nullptr));
+static bool cudaProfilerRun = ((getenv("NVPROF_ID") != nullptr));
  
  /** Dummy kernel used for sanity checking. */
-static __global__ void k_dummy_test(void)
-{
-}
+static __global__ void k_dummy_test(void) {}
  
-static cudaError_t checkCompiledTargetCompatibility(int                   deviceId,
-                                                    const cudaDeviceProp &deviceProp)
+static cudaError_t checkCompiledTargetCompatibility(int deviceId, const cudaDeviceProp& deviceProp)
  {
      cudaFuncAttributes attributes;
      cudaError_t        stat = cudaFuncGetAttributes(&attributes, k_dummy_test);
@@ -91,24 +90,21 @@ static cudaError_t checkCompiledTargetCompatibility(int                   device
                  "might be rare, or some architectures were disabled in the build. \n"
                  "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
                  "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. \n",
-                gmx::getProgramContext().displayName(), deviceId,
-                deviceProp.major, deviceProp.minor);
+                gmx::getProgramContext().displayName(), deviceId, deviceProp.major, deviceProp.minor);
      }
  
      return stat;
  }
  
-bool isHostMemoryPinned(const void *h_ptr)
+bool isHostMemoryPinned(const void* h_ptr)
  {
      cudaPointerAttributes memoryAttributes;
      cudaError_t           stat = cudaPointerGetAttributes(&memoryAttributes, h_ptr);
  
-    bool                  result = false;
+    bool result = false;
      switch (stat)
      {
-        case cudaSuccess:
-            result = true;
-            break;
+        case cudaSuccess: result = true; break;
  
          case cudaErrorInvalidValue:
              // If the buffer was not pinned, then it will not be recognized by CUDA at all
@@ -117,8 +113,7 @@ bool isHostMemoryPinned(const void *h_ptr)
              cudaGetLastError();
              break;
  
-        default:
-            CU_RET_ERR(stat, "Unexpected CUDA error");
+        default: CU_RET_ERR(stat, "Unexpected CUDA error");
      }
      return result;
  }
@@ -135,7 +130,7 @@ bool isHostMemoryPinned(const void *h_ptr)
   *
   * TODO: introduce errors codes and handle errors more smoothly.
   */
-static int do_sanity_checks(int dev_id, const cudaDeviceProp &dev_prop)
+static int do_sanity_checks(int dev_id, const cudaDeviceProp& dev_prop)
  {
      cudaError_t cu_err;
      int         dev_count, id;
@@ -143,8 +138,7 @@ static int do_sanity_checks(int dev_id, const cudaDeviceProp &dev_prop)
      cu_err = cudaGetDeviceCount(&dev_count);
      if (cu_err != cudaSuccess)
      {
-        fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
-                cudaGetErrorString(cu_err));
+        fprintf(stderr, "Error %d while querying device count: %s\n", cu_err, cudaGetErrorString(cu_err));
          return -1;
      }
  
@@ -165,8 +159,7 @@ static int do_sanity_checks(int dev_id, const cudaDeviceProp &dev_prop)
          cu_err = cudaGetDevice(&id);
          if (cu_err != cudaSuccess)
          {
-            fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
-                    cudaGetErrorString(cu_err));
+            fprintf(stderr, "Error %d while querying device id: %s\n", cu_err, cudaGetErrorString(cu_err));
              return -1;
          }
      }
@@ -175,7 +168,8 @@ static int do_sanity_checks(int dev_id, const cudaDeviceProp &dev_prop)
          id = dev_id;
          if (id > dev_count - 1) /* pfff there's no such device */
          {
-            fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
+            fprintf(stderr,
+                    "The requested device with id %d does not seem to exist (device count=%d)\n",
                      dev_id, dev_count);
              return -1;
          }
@@ -197,8 +191,8 @@ static int do_sanity_checks(int dev_id, const cudaDeviceProp &dev_prop)
          cu_err = cudaSetDevice(id);
          if (cu_err != cudaSuccess)
          {
-            fprintf(stderr, "Error %d while switching to device #%d: %s\n",
-                    cu_err, id, cudaGetErrorString(cu_err));
+            fprintf(stderr, "Error %d while switching to device #%d: %s\n", cu_err, id,
+                    cudaGetErrorString(cu_err));
              return -1;
          }
      }
@@ -220,15 +214,16 @@ static int do_sanity_checks(int dev_id, const cudaDeviceProp &dev_prop)
      try
      {
          KernelLaunchConfig config;
-        config.blockSize[0] = 512;
-        const auto         dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
+        config.blockSize[0]       = 512;
+        const auto dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
          launchGpuKernel(k_dummy_test, config, nullptr, "Dummy kernel", dummyArguments);
      }
-    catch (gmx::GromacsException &ex)
+    catch (gmx::GromacsException& ex)
      {
          // launchGpuKernel error is not fatal and should continue with marking the device bad
-        fprintf(stderr, "Error occurred while running dummy kernel sanity check on device #%d:\n %s\n",
-                id, formatExceptionMessageToString(ex).c_str());
+        fprintf(stderr,
+                "Error occurred while running dummy kernel sanity check on device #%d:\n %s\n", id,
+                formatExceptionMessageToString(ex).c_str());
          return -1;
      }
  
@@ -247,7 +242,7 @@ static int do_sanity_checks(int dev_id, const cudaDeviceProp &dev_prop)
      return 0;
  }
  
-void init_gpu(const gmx_device_info_t *deviceInfo)
+void init_gpu(const gmx_device_info_t* deviceInfo)
  {
      cudaError_t stat;
  
@@ -266,7 +261,7 @@ void init_gpu(const gmx_device_info_t *deviceInfo)
      }
  }
  
-void free_gpu(const gmx_device_info_t *deviceInfo)
+void free_gpu(const gmx_device_info_t* deviceInfo)
  {
      // One should only attempt to clear the device context when
      // it has been used, but currently the only way to know that a GPU
@@ -276,7 +271,7 @@ void free_gpu(const gmx_device_info_t *deviceInfo)
          return;
      }
  
-    cudaError_t  stat;
+    cudaError_t stat;
  
      if (debug)
      {
@@ -293,8 +288,7 @@ void free_gpu(const gmx_device_info_t *deviceInfo)
      }
  }
  
-gmx_device_info_t *getDeviceInfo(const gmx_gpu_info_t &gpu_info,
-                                 int                   deviceId)
+gmx_device_info_t* getDeviceInfo(const gmx_gpu_info_t& gpu_info, int deviceId)
  {
      if (deviceId < 0 || deviceId >= gpu_info.n_dev)
      {
@@ -310,7 +304,7 @@ gmx_device_info_t *getDeviceInfo(const gmx_gpu_info_t &gpu_info,
   * \returns             true if the GPU properties passed indicate a compatible
   *                      GPU, otherwise false.
   */
-static bool is_gmx_supported_gpu(const cudaDeviceProp &dev_prop)
+static bool is_gmx_supported_gpu(const cudaDeviceProp& dev_prop)
  {
      return (dev_prop.major >= 3);
  }
@@ -329,8 +323,7 @@ static bool is_gmx_supported_gpu(const cudaDeviceProp &dev_prop)
   *  \param[in]  deviceProp the CUDA device properties of the device checked.
   *  \returns               the status of the requested device
   */
-static int is_gmx_supported_gpu_id(int                   deviceId,
-                                   const cudaDeviceProp &deviceProp)
+static int is_gmx_supported_gpu_id(int deviceId, const cudaDeviceProp& deviceProp)
  {
      if (!is_gmx_supported_gpu(deviceProp))
      {
@@ -345,23 +338,27 @@ static int is_gmx_supported_gpu_id(int                   deviceId,
      const int checkResult = do_sanity_checks(deviceId, deviceProp);
      switch (checkResult)
      {
-        case  0: return egpuCompatible;
+        case 0: return egpuCompatible;
          case -1: return egpuInsane;
          case -2: return egpuUnavailable;
-        default: GMX_RELEASE_ASSERT(false, "Invalid do_sanity_checks() return value");
+        default:
+            GMX_RELEASE_ASSERT(false, "Invalid do_sanity_checks() return value");
              return egpuCompatible;
      }
  }
  
-bool canDetectGpus(std::string *errorMessage)
+bool isGpuDetectionFunctional(std::string* errorMessage)
  {
-    cudaError_t        stat;
-    int                driverVersion = -1;
-    stat = cudaDriverGetVersion(&driverVersion);
-    GMX_ASSERT(stat != cudaErrorInvalidValue, "An impossible null pointer was passed to cudaDriverGetVersion");
-    GMX_RELEASE_ASSERT(stat == cudaSuccess,
-                       gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
-                                         cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str());
+    cudaError_t stat;
+    int         driverVersion = -1;
+    stat                      = cudaDriverGetVersion(&driverVersion);
+    GMX_ASSERT(stat != cudaErrorInvalidValue,
+               "An impossible null pointer was passed to cudaDriverGetVersion");
+    GMX_RELEASE_ASSERT(
+            stat == cudaSuccess,
+            gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
+                              cudaGetErrorName(stat), cudaGetErrorString(stat))
+                    .c_str());
      bool foundDriver = (driverVersion > 0);
      if (!foundDriver)
      {
@@ -405,7 +402,7 @@ bool canDetectGpus(std::string *errorMessage)
      return true;
  }
  
-void findGpus(gmx_gpu_info_t *gpu_info)
+void findGpus(gmx_gpu_info_t* gpu_info)
  {
      assert(gpu_info);
  
@@ -415,14 +412,15 @@ void findGpus(gmx_gpu_info_t *gpu_info)
      cudaError_t stat = cudaGetDeviceCount(&ndev);
      if (stat != cudaSuccess)
      {
-        GMX_THROW(gmx::InternalError("Invalid call of findGpus() when CUDA API returned an error, perhaps "
-                                     "canDetectGpus() was not called appropriately beforehand."));
+        GMX_THROW(gmx::InternalError(
+                "Invalid call of findGpus() when CUDA API returned an error, perhaps "
+                "canDetectGpus() was not called appropriately beforehand."));
      }
  
      // We expect to start device support/sanity checks with a clean runtime error state
      gmx::ensureNoPendingCudaError("");
  
-    gmx_device_info_t *devs;
+    gmx_device_info_t* devs;
      snew(devs, ndev);
      for (int i = 0; i < ndev; i++)
      {
@@ -471,14 +469,16 @@ void findGpus(gmx_gpu_info_t *gpu_info)
  
      stat = cudaPeekAtLastError();
      GMX_RELEASE_ASSERT(stat == cudaSuccess,
-                       gmx::formatString("We promise to return with clean CUDA state, but non-success state encountered: %s: %s",
-                                         cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str());
+                       gmx::formatString("We promise to return with clean CUDA state, but "
+                                         "non-success state encountered: %s: %s",
+                                         cudaGetErrorName(stat), cudaGetErrorString(stat))
+                               .c_str());
  
      gpu_info->n_dev   = ndev;
      gpu_info->gpu_dev = devs;
  }
  
-void get_gpu_device_info_string(char *s, const gmx_gpu_info_t &gpu_info, int index)
+void get_gpu_device_info_string(char* s, const gmx_gpu_info_t& gpu_info, int index)
  {
      assert(s);
  
@@ -487,24 +487,19 @@ void get_gpu_device_info_string(char *s, const gmx_gpu_info_t &gpu_info, int ind
          return;
      }
  
-    gmx_device_info_t *dinfo = &gpu_info.gpu_dev[index];
+    gmx_device_info_t* dinfo = &gpu_info.gpu_dev[index];
  
-    bool               bGpuExists = (dinfo->stat != egpuNonexistent &&
-                                     dinfo->stat != egpuInsane);
+    bool bGpuExists = (dinfo->stat != egpuNonexistent && dinfo->stat != egpuInsane);
  
      if (!bGpuExists)
      {
-        sprintf(s, "#%d: %s, stat: %s",
-                dinfo->id, "N/A",
-                gpu_detect_res_str[dinfo->stat]);
+        sprintf(s, "#%d: %s, stat: %s", dinfo->id, "N/A", gpu_detect_res_str[dinfo->stat]);
      }
      else
      {
-        sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
-                dinfo->id, dinfo->prop.name,
-                dinfo->prop.major, dinfo->prop.minor,
-                dinfo->prop.ECCEnabled ? "yes" : " no",
-                gpu_detect_res_str[dinfo->stat]);
+        sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s", dinfo->id,
+                dinfo->prop.name, dinfo->prop.major, dinfo->prop.minor,
+                dinfo->prop.ECCEnabled ? "yes" : " no", gpu_detect_res_str[dinfo->stat]);
      }
  }
  
@@ -521,22 +516,6 @@ size_t sizeof_gpu_dev_info(void)
      return sizeof(gmx_device_info_t);
  }
  
-void gpu_set_host_malloc_and_free(bool               bUseGpuKernels,
-                                  gmx_host_alloc_t **nb_alloc,
-                                  gmx_host_free_t  **nb_free)
-{
-    if (bUseGpuKernels)
-    {
-        *nb_alloc = &pmalloc;
-        *nb_free  = &pfree;
-    }
-    else
-    {
-        *nb_alloc = nullptr;
-        *nb_free  = nullptr;
-    }
-}
-
  void startGpuProfiler(void)
  {
      /* The NVPROF_ID environment variable is set by nvprof and indicates that
@@ -580,7 +559,99 @@ void resetGpuProfiler(void)
      }
  }
  
-int gpu_info_get_stat(const gmx_gpu_info_t &info, int index)
+int gpu_info_get_stat(const gmx_gpu_info_t& info, int index)
  {
      return info.gpu_dev[index].stat;
  }
+
+/*! \brief Check status returned from peer access CUDA call, and error out or warn appropriately
+ * \param[in] stat           CUDA call return status
+ * \param[in] gpuA           ID for GPU initiating peer access call
+ * \param[in] gpuB           ID for remote GPU
+ * \param[in] mdlog          Logger object
+ * \param[in] cudaCallName   name of CUDA peer access call
+ */
+static void peerAccessCheckStat(const cudaError_t    stat,
+                                const int            gpuA,
+                                const int            gpuB,
+                                const gmx::MDLogger& mdlog,
+                                const char*          cudaCallName)
+{
+    if ((stat == cudaErrorInvalidDevice) || (stat == cudaErrorInvalidValue))
+    {
+        std::string errorString =
+                gmx::formatString("%s from GPU %d to GPU %d failed", cudaCallName, gpuA, gpuB);
+        CU_RET_ERR(stat, errorString.c_str());
+    }
+    if (stat != cudaSuccess)
+    {
+        GMX_LOG(mdlog.warning)
+                .asParagraph()
+                .appendTextFormatted(
+                        "GPU peer access not enabled between GPUs %d and %d due to unexpected "
+                        "return value from %s: %s",
+                        gpuA, gpuB, cudaCallName, cudaGetErrorString(stat));
+    }
+}
+
+void setupGpuDevicePeerAccess(const std::vector<int>& gpuIdsToUse, const gmx::MDLogger& mdlog)
+{
+    cudaError_t stat;
+
+    // take a note of currently-set GPU
+    int currentGpu;
+    stat = cudaGetDevice(&currentGpu);
+    CU_RET_ERR(stat, "cudaGetDevice in setupGpuDevicePeerAccess failed");
+
+    std::string message = gmx::formatString(
+            "Note: Peer access enabled between the following GPU pairs in the node:\n ");
+    bool peerAccessEnabled = false;
+
+    for (unsigned int i = 0; i < gpuIdsToUse.size(); i++)
+    {
+        int gpuA = gpuIdsToUse[i];
+        stat     = cudaSetDevice(gpuA);
+        if (stat != cudaSuccess)
+        {
+            GMX_LOG(mdlog.warning)
+                    .asParagraph()
+                    .appendTextFormatted(
+                            "GPU peer access not enabled due to unexpected return value from "
+                            "cudaSetDevice(%d): %s",
+                            gpuA, cudaGetErrorString(stat));
+            return;
+        }
+        for (unsigned int j = 0; j < gpuIdsToUse.size(); j++)
+        {
+            if (j != i)
+            {
+                int gpuB          = gpuIdsToUse[j];
+                int canAccessPeer = 0;
+                stat              = cudaDeviceCanAccessPeer(&canAccessPeer, gpuA, gpuB);
+                peerAccessCheckStat(stat, gpuA, gpuB, mdlog, "cudaDeviceCanAccessPeer");
+
+                if (canAccessPeer)
+                {
+                    stat = cudaDeviceEnablePeerAccess(gpuB, 0);
+                    peerAccessCheckStat(stat, gpuA, gpuB, mdlog, "cudaDeviceEnablePeerAccess");
+
+                    message           = gmx::formatString("%s%d->%d ", message.c_str(), gpuA, gpuB);
+                    peerAccessEnabled = true;
+                }
+            }
+        }
+    }
+
+    // re-set GPU to that originally set
+    stat = cudaSetDevice(currentGpu);
+    if (stat != cudaSuccess)
+    {
+        CU_RET_ERR(stat, "cudaSetDevice in setupGpuDevicePeerAccess failed");
+        return;
+    }
+
+    if (peerAccessEnabled)
+    {
+        GMX_LOG(mdlog.info).asParagraph().appendTextFormatted("%s", message.c_str());
+    }
+}