From 91118c53d20435528da7d95942ef745a1363266b Mon Sep 17 00:00:00 2001
From: Berk Hess <hess@kth.se>
Date: Thu, 7 May 2015 22:28:06 +0200
Subject: [PATCH] Automated the -gpu_id option

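When the number of PP ranks in a node was at most the number of GPUs,
the GPU IDs were already assigned automatically. Now, when the number
of PP ranks is a multiple of the number of GPUs, the GPU IDs are also
assigned automatically, with several PP ranks sharing each GPU.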

Change-Id: I59079b542b5703553c3e1b841c47abdc65f64459
---
 src/gromacs/gmxlib/gmx_detect_hardware.cpp | 63 +++++++++++++++++-----
 1 file changed, 51 insertions(+), 12 deletions(-)

diff --git a/src/gromacs/gmxlib/gmx_detect_hardware.cpp b/src/gromacs/gmxlib/gmx_detect_hardware.cpp
index a851dbee5d..709dc0c037 100644
--- a/src/gromacs/gmxlib/gmx_detect_hardware.cpp
+++ b/src/gromacs/gmxlib/gmx_detect_hardware.cpp
@@ -94,7 +94,7 @@ static tMPI_Thread_mutex_t hw_info_lock = TMPI_THREAD_MUTEX_INITIALIZER;
 
 
 /* FW decl. */
-static void limit_num_gpus_used(gmx_gpu_opt_t *gpu_opt, int count);
+static void set_gpu_ids(gmx_gpu_opt_t *gpu_opt, int nrank, int rank);
 static int gmx_count_gpu_dev_unique(const gmx_gpu_info_t *gpu_info,
                                     const gmx_gpu_opt_t  *gpu_opt);
 
@@ -1081,6 +1081,12 @@ void gmx_select_gpu_ids(FILE *fplog, const t_commrec *cr,
         gmx_fatal(FARGS, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
     }
 
+    if (!(cr->duty & DUTY_PP))
+    {
+        /* Our rank is not doing PP, we don't use a GPU */
+        return;
+    }
+
     if (gpu_opt->bUserSet)
     {
         /* Check the GPU IDs passed by the user.
@@ -1116,7 +1122,7 @@ void gmx_select_gpu_ids(FILE *fplog, const t_commrec *cr,
     else
     {
         pick_compatible_gpus(&hwinfo_g->gpu_info, gpu_opt);
-        limit_num_gpus_used(gpu_opt, cr->nrank_pp_intranode);
+        set_gpu_ids(gpu_opt, cr->nrank_pp_intranode, cr->rank_pp_intranode);
     }
 
     /* If the user asked for a GPU, check whether we have a GPU */
@@ -1126,25 +1132,58 @@ void gmx_select_gpu_ids(FILE *fplog, const t_commrec *cr,
     }
 }
 
-/* If we detected more compatible GPUs than we can use, limit the
- * number. We print detailed messages about this later in
- * gmx_check_hw_runconf_consistency.
+/* Select the GPUs we will use. This is an operation local to each physical
+ * node. If we have fewer MPI ranks than GPUs, we will waste some GPUs.
+ * nrank and rank are the rank count and id for PP processes in our node.
  */
-static void limit_num_gpus_used(gmx_gpu_opt_t *gpu_opt, int maxNumberToUse)
+static void set_gpu_ids(gmx_gpu_opt_t *gpu_opt, int nrank, int rank)
 {
     GMX_RELEASE_ASSERT(gpu_opt, "Invalid gpu_opt pointer passed");
-    GMX_RELEASE_ASSERT(maxNumberToUse >= 1,
+    GMX_RELEASE_ASSERT(nrank >= 1,
                        gmx::formatString("Invalid limit (%d) for the number of GPUs (detected %d compatible GPUs)",
-                                         maxNumberToUse, gpu_opt->n_dev_compatible).c_str());
+                                         nrank, gpu_opt->n_dev_compatible).c_str());
+
+    if (gpu_opt->n_dev_compatible == 0)
+    {
+        char host[255];
+
+        gmx_gethostname(host, 255);
+        gmx_fatal(FARGS, "A GPU was requested on host %s, but no compatible GPUs were detected. All nodes with PP ranks need to have GPUs. If you intended to use GPU acceleration in a parallel run, you can either avoid using the nodes that don't have GPUs or place PME ranks on these nodes.", host);
+    }
+
+    int nshare;
+
+    nshare = 1;
+    if (nrank > gpu_opt->n_dev_compatible)
+    {
+        if (nrank % gpu_opt->n_dev_compatible == 0)
+        {
+            nshare = nrank/gpu_opt->n_dev_compatible;
+        }
+        else
+        {
+            if (rank == 0)
+            {
+                gmx_fatal(FARGS, "The number of MPI ranks (%d) in a physical node is not a multiple of the number of GPUs (%d). Select a different number of MPI ranks or use the -gpu_id option to manually specify the GPU to be used.",
+                          nrank, gpu_opt->n_dev_compatible);
+            }
+
+#ifdef GMX_MPI
+            /* We use a global barrier to prevent ranks from continuing with
+             * an invalid setup.
+             */
+            MPI_Barrier(MPI_COMM_WORLD);
+#endif
+        }
+    }

-    /* Don't increase the number of GPUs used beyond (e.g.) the number
-       of PP ranks */
-    gpu_opt->n_dev_use = std::min(gpu_opt->n_dev_compatible, maxNumberToUse);
+    /* Here we will waste GPUs when nrank < gpu_opt->n_dev_compatible */
+    gpu_opt->n_dev_use = std::min(gpu_opt->n_dev_compatible*nshare, nrank);
     snew(gpu_opt->dev_use, gpu_opt->n_dev_use);
     for (int i = 0; i != gpu_opt->n_dev_use; ++i)
     {
         /* TODO: improve this implementation: either sort GPUs or remove the weakest here */
-        gpu_opt->dev_use[i] = gpu_opt->dev_compatible[i];
+        gpu_opt->dev_use[i] = gpu_opt->dev_compatible[i/nshare];
     }
 }
 
-- 
2.22.0