Fix multiple MPI ranks per node with OpenCL

author Szilárd Páll <pall.szilard@gmail.com>

Sun, 3 Apr 2016 23:39:48 +0000 (01:39 +0200)

committer Mark Abraham <mark.j.abraham@gmail.com>

Wed, 27 Apr 2016 15:46:23 +0000 (17:46 +0200)
author Szilárd Páll <pall.szilard@gmail.com>
Sun, 3 Apr 2016 23:39:48 +0000 (01:39 +0200)
committer Mark Abraham <mark.j.abraham@gmail.com>
Wed, 27 Apr 2016 15:46:23 +0000 (17:46 +0200)
diff --git a/docs/user-guide/mdrun-performance.rst b/docs/user-guide/mdrun-performance.rst

index 9bbd4d2c62ae591076e109253106bad54214f146..c1062e34c269e6a92e5da801f509a5762b8149a5 100644 (file)
--- a/docs/user-guide/mdrun-performance.rst
+++ b/docs/user-guide/mdrun-performance.rst
@@ -553,7 +553,6 @@ Known limitations of the OpenCL support
  
  Limitations in the current OpenCL support of interest to |Gromacs| users:
  
-- Using more than one GPU on a node is supported only with thread MPI
  - No Intel devices (CPUs, GPUs or Xeon Phi) are supported
  - Due to blocking behavior of some asynchronous task enqueuing functions
    in the NVIDIA OpenCL runtime, with the affected driver versions there is
diff --git a/src/gromacs/hardware/detecthardware.cpp b/src/gromacs/hardware/detecthardware.cpp

index 91715e6fd42762dc5c85e8902940d6257b4a6e0d..b3fdde15e3b3a32f7dae3d0e350b31d16f5fe0da 100644 (file)
--- a/src/gromacs/hardware/detecthardware.cpp
+++ b/src/gromacs/hardware/detecthardware.cpp
@@ -92,13 +92,10 @@ static const bool bGPUBinary = GMX_GPU != GMX_GPU_NONE;
  static const bool gpuSharingSupport[] = { false, true, true };
  static const bool bGpuSharingSupported = gpuSharingSupport[GMX_GPU];
  
-/* CUDA supports everything. Our current OpenCL implementation seems
- * to handle concurrency correctly with thread-MPI. The AMD OpenCL
- * runtime does not seem to support creating a context from more than
- * one real MPI rank on the same node (it segfaults when you try).
+/* Both CUDA and OpenCL (on the tested/supported platforms) supports everything.
   */
  static const bool multiGpuSupport[] = {
-    false, true, GMX_THREAD_MPI
+    false, true, true
  };
  static const bool bMultiGpuPerNodeSupported = multiGpuSupport[GMX_GPU];
  
@@ -708,6 +705,10 @@ static void gmx_detect_gpus(FILE *fplog, const t_commrec *cr)
       * the detection only on one MPI rank per node and broadcast the info.
       * Note that with thread-MPI only a single thread runs this code.
       *
+     * NOTE: We can't broadcast gpu_info with OpenCL as the device and platform
+     * ID stored in the structure are unique for each rank (even if a device
+     * is shared by multiple ranks).
+     *
       * TODO: We should also do CPU hardware detection only once on each
       * physical node and broadcast it, instead of do it on every MPI rank.
       */
@@ -726,7 +727,11 @@ static void gmx_detect_gpus(FILE *fplog, const t_commrec *cr)
      rank_local = 0;
  #endif
  
-    if (rank_local == 0)
+    /*  With CUDA detect only on one rank per host, with OpenCL need do
+     *  the detection on all PP ranks */
+    bool isOpenclPpRank = ((GMX_GPU == GMX_GPU_OPENCL) && (cr->duty & DUTY_PP));
+
+    if (rank_local == 0 || isOpenclPpRank)
      {
          char detection_error[STRLEN] = "", sbuf[STRLEN];
  
@@ -748,24 +753,27 @@ static void gmx_detect_gpus(FILE *fplog, const t_commrec *cr)
      }
  
  #if GMX_LIB_MPI
-    /* Broadcast the GPU info to the other ranks within this node */
-    MPI_Bcast(&hwinfo_g->gpu_info.n_dev, 1, MPI_INT, 0, physicalnode_comm);
-
-    if (hwinfo_g->gpu_info.n_dev > 0)
+    if (!isOpenclPpRank)
      {
-        int dev_size;
-
-        dev_size = hwinfo_g->gpu_info.n_dev*sizeof_gpu_dev_info();
+        /* Broadcast the GPU info to the other ranks within this node */
+        MPI_Bcast(&hwinfo_g->gpu_info.n_dev, 1, MPI_INT, 0, physicalnode_comm);
  
-        if (rank_local > 0)
+        if (hwinfo_g->gpu_info.n_dev > 0)
          {
-            hwinfo_g->gpu_info.gpu_dev =
-                (struct gmx_device_info_t *)malloc(dev_size);
+            int dev_size;
+
+            dev_size = hwinfo_g->gpu_info.n_dev*sizeof_gpu_dev_info();
+
+            if (rank_local > 0)
+            {
+                hwinfo_g->gpu_info.gpu_dev =
+                    (struct gmx_device_info_t *)malloc(dev_size);
+            }
+            MPI_Bcast(hwinfo_g->gpu_info.gpu_dev, dev_size, MPI_BYTE,
+                      0, physicalnode_comm);
+            MPI_Bcast(&hwinfo_g->gpu_info.n_dev_compatible, 1, MPI_INT,
+                      0, physicalnode_comm);
          }
-        MPI_Bcast(hwinfo_g->gpu_info.gpu_dev, dev_size, MPI_BYTE,
-                  0, physicalnode_comm);
-        MPI_Bcast(&hwinfo_g->gpu_info.n_dev_compatible, 1, MPI_INT,
-                  0, physicalnode_comm);
      }
  
      MPI_Comm_free(&physicalnode_comm);
author	Szilárd Páll <pall.szilard@gmail.com>
	Sun, 3 Apr 2016 23:39:48 +0000 (01:39 +0200)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Wed, 27 Apr 2016 15:46:23 +0000 (17:46 +0200)
docs/user-guide/mdrun-performance.rst		patch \| blob \| history
src/gromacs/hardware/detecthardware.cpp		patch \| blob \| history