From: Berk Hess
Date: Fri, 27 Sep 2013 12:24:58 +0000 (+0200)
Subject: corrected dynamic load balancing when sharing GPUs
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=ba8232e965652669cc0b558a273f81a4d9733d25;p=alexxy%2Fgromacs.git

When sharing GPUs over MPI ranks, the time the GPU is busy might not
reflect the actual load. To make the dynamic load balancing between
domains work correctly, the GPU wait times are now redistributed over
the ranks/domains sharing a GPU.

Change-Id: Id9414e3ef7cc5a73a2b4560a0e10c2ee8ab1257f
---

diff --git a/include/domdec.h b/include/domdec.h
index 19a3a78e44..e94139b3a6 100644
--- a/include/domdec.h
+++ b/include/domdec.h
@@ -148,6 +148,16 @@ void change_dd_dlb_cutoff_limit(t_commrec *cr);
  * possible after subsequently setting a shorter cut-off with change_dd_cutoff.
  */
 
+GMX_LIBMD_EXPORT
+void dd_setup_dlb_resource_sharing(t_commrec           *cr,
+                                   const gmx_hw_info_t *hwinfo,
+                                   const gmx_hw_opt_t  *hw_opt);
+/* When domains (PP MPI ranks) share a GPU, the individual GPU wait times
+ * are meaningless, as they depend on the order in which tasks on the same
+ * GPU finish. Therefore the wait times need to be averaged over the ranks
+ * sharing the same GPU. This function sets up the communication for that.
+ */
+
 GMX_LIBMD_EXPORT
 void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd);
 
@@ -159,7 +169,7 @@ void dd_collect_state(gmx_domdec_t *dd,
                       t_state *state_local, t_state *state);
 
 enum {
-    ddCyclStep, ddCyclPPduringPME, ddCyclF, ddCyclPME, ddCyclNr
+    ddCyclStep, ddCyclPPduringPME, ddCyclF, ddCyclWaitGPU, ddCyclPME, ddCyclNr
 };
 
 GMX_LIBMD_EXPORT
diff --git a/src/kernel/runner.c b/src/kernel/runner.c
index 42ef770eda..17b5f351ad 100644
--- a/src/kernel/runner.c
+++ b/src/kernel/runner.c
@@ -1485,6 +1485,12 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
     /* check consistency of CPU acceleration and number of GPUs selected */
     gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt, bUseGPU);
 
+    if (DOMAINDECOMP(cr))
+    {
+        /* When we share GPUs over ranks, we need to know this for the DLB */
+        dd_setup_dlb_resource_sharing(cr, hwinfo, hw_opt);
+    }
+
     /* getting number of PP/PME threads
        PME: env variable should be read only on one node to make sure it is
        identical everywhere;
diff --git a/src/mdlib/domdec.c b/src/mdlib/domdec.c
index d78ed38a1d..d488b0bdf1 100644
--- a/src/mdlib/domdec.c
+++ b/src/mdlib/domdec.c
@@ -73,6 +73,7 @@
 #include "nbnxn_search.h"
 #include "bondf.h"
 #include "gmx_omp_nthreads.h"
+#include "gpu_utils.h"
 
 #ifdef GMX_LIB_MPI
 #include <mpi.h>
@@ -368,8 +369,10 @@ typedef struct gmx_domdec_comm
     /* Stuff for load communication */
     gmx_bool           bRecordLoad;
     gmx_domdec_load_t *load;
+    int                nrank_gpu_shared;
 #ifdef GMX_MPI
     MPI_Comm          *mpi_comm_load;
+    MPI_Comm           mpi_comm_gpu_shared;
 #endif
 
     /* Maximum DLB scaling per load balancing step in percent */
@@ -2897,12 +2900,40 @@ static float dd_force_load(gmx_domdec_comm_t *comm)
         if (comm->cycl_n[ddCyclF] > 1)
         {
             /* Subtract the maximum of the last n cycle counts
-             * to get rid of possible high counts due to other soures,
+             * to get rid of possible high counts due to other sources,
              * for instance system activity, that would otherwise
              * affect the dynamic load balancing.
              */
             load -= comm->cycl_max[ddCyclF];
         }
+
+#ifdef GMX_MPI
+        if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
+        {
+            float gpu_wait, gpu_wait_sum;
+
+            gpu_wait = comm->cycl[ddCyclWaitGPU];
+            if (comm->cycl_n[ddCyclF] > 1)
+            {
+                /* We should remove the WaitGPU time of the same MD step
+                 * as the one with the maximum F time, since the F time
+                 * and the wait time are not independent.
+                 * Furthermore, the step for the max F time should be chosen
+                 * the same on all ranks that share the same GPU.
+                 * But to keep the code simple, we remove the average instead.
+                 * The main reason for artificially long times at some steps
+                 * is spurious CPU activity or MPI time, so we don't expect
+                 * that changes in the GPU wait time matter a lot here.
+                 */
+                gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
+            }
+            /* Sum the wait times over the ranks that share the same GPU */
+            MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
+                          comm->mpi_comm_gpu_shared);
+            /* Replace the wait time by the average over the ranks */
+            load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
+        }
+#endif
     }
 
     return load;
 }
@@ -5645,6 +5676,62 @@ static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
 }
 #endif
 
+void dd_setup_dlb_resource_sharing(t_commrec           *cr,
+                                   const gmx_hw_info_t *hwinfo,
+                                   const gmx_hw_opt_t  *hw_opt)
+{
+#ifdef GMX_MPI
+    int           physicalnode_id_hash;
+    int           gpu_id;
+    gmx_domdec_t *dd;
+    MPI_Comm      mpi_comm_pp_physicalnode;
+
+    if (!(cr->duty & DUTY_PP) ||
+        hw_opt->gpu_opt.ncuda_dev_use == 0)
+    {
+        /* Only PP nodes (currently) use GPUs.
+         * If we don't have GPUs, there are no resources to share.
+         */
+        return;
+    }
+
+    physicalnode_id_hash = gmx_physicalnode_id_hash();
+
+    gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->nodeid);
+
+    dd = cr->dd;
+
+    if (debug)
+    {
+        fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
+        fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
+                dd->rank, physicalnode_id_hash, gpu_id);
+    }
+    /* Split the PP communicator over the physical nodes */
+    /* TODO: See if we should store this (before), as it's also used
+     * for the nodecomm summation.
+     */
+    MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
+                   &mpi_comm_pp_physicalnode);
+    MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
+                   &dd->comm->mpi_comm_gpu_shared);
+    MPI_Comm_free(&mpi_comm_pp_physicalnode);
+    MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
+
+    if (debug)
+    {
+        fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
+    }
+
+    /* Note that some ranks could share a GPU, while others don't */
+
+    if (dd->comm->nrank_gpu_shared == 1)
+    {
+        MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
+    }
+#endif
+}
+
 static void make_load_communicators(gmx_domdec_t *dd)
 {
 #ifdef GMX_MPI
@@ -6616,6 +6703,9 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
     }
 
+    /* Initialize the GPU share count to 0, might change later */
+    comm->nrank_gpu_shared = 0;
+
     comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
 
     comm->bDynLoadBal = (comm->eDLB == edlbYES);
diff --git a/src/mdlib/sim_util.c b/src/mdlib/sim_util.c
index fa04b6fc97..a6cafa8304 100644
--- a/src/mdlib/sim_util.c
+++ b/src/mdlib/sim_util.c
@@ -766,12 +766,13 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
     matrix              boxs;
     rvec                vzero, box_diag;
     real                e, v, dvdl;
-    float               cycles_pme, cycles_force;
+    float               cycles_pme, cycles_force, cycles_wait_gpu;
     nonbonded_verlet_t *nbv;
 
-    cycles_force   = 0;
-    nbv            = fr->nbv;
-    nb_kernel_type = fr->nbv->grp[0].kernel_type;
+    cycles_force    = 0;
+    cycles_wait_gpu = 0;
+    nbv             = fr->nbv;
+    nb_kernel_type  = fr->nbv->grp[0].kernel_type;
 
     start  = mdatoms->start;
     homenr = mdatoms->homenr;
@@ -1284,13 +1285,17 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         {
             if (bUseGPU)
             {
+                float cycles_tmp;
+
                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
                 nbnxn_cuda_wait_gpu(nbv->cu_nbv,
                                     nbv->grp[eintNonlocal].nbat,
                                     flags, eatNonlocal,
                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                     fr->fshift);
-                cycles_force += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
+                cycles_tmp       = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
+                cycles_wait_gpu += cycles_tmp;
+                cycles_force    += cycles_tmp;
             }
             else
             {
@@ -1356,7 +1361,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
                             flags, eatLocal,
                             enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                             fr->fshift);
-        wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+        cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
 
         /* now clear the GPU outputs while we finish the step on the CPU */
 
@@ -1390,6 +1395,10 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         if (wcycle)
        {
             dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
+            if (bUseGPU)
+            {
+                dd_cycles_add(cr->dd, cycles_wait_gpu, ddCyclWaitGPU);
+            }
         }
     }
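The dd_force_load() hunk above carries the core of the change: each PP rank's GPU wait time is summed over the ranks that share its GPU (mpi_comm_gpu_shared) and replaced by the average, so the DLB sees a load that does not depend on which rank's GPU task happened to finish first. Below is a minimal standalone MPI sketch of that averaging. It is not GROMACS code: the rank-to-GPU mapping, the timing values, and the file/variable names are made up for illustration, and the communicator split omits the per-physical-node step that dd_setup_dlb_resource_sharing() performs.

/* dlb_share_sketch.c - illustrative only, not part of GROMACS.
 * Build:  mpicc -o dlb_share_sketch dlb_share_sketch.c
 * Run:    mpirun -np 4 ./dlb_share_sketch
 */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int      rank, gpu_id, nrank_gpu_shared;
    float    cpu_time, gpu_wait, gpu_wait_sum, load;
    MPI_Comm comm_gpu_shared;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* Assumed mapping for this sketch: two consecutive ranks share one GPU. */
    gpu_id = rank/2;

    /* Group the ranks that use the same GPU, analogous to mpi_comm_gpu_shared. */
    MPI_Comm_split(MPI_COMM_WORLD, gpu_id, rank, &comm_gpu_shared);
    MPI_Comm_size(comm_gpu_shared, &nrank_gpu_shared);

    /* Fake timings: the per-rank wait depends only on which rank's GPU task
     * finished first, so by itself it is not a meaningful load signal. */
    cpu_time = 10.0f + rank;
    gpu_wait = (rank % 2 == 0) ? 4.0f : 1.0f;

    /* Naive per-rank load: CPU force time plus time spent waiting for the GPU. */
    load = cpu_time + gpu_wait;

    /* Sum the wait times over the ranks sharing the GPU ... */
    MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
                  comm_gpu_shared);
    /* ... and replace the local wait time by the average, as in dd_force_load(). */
    load += -gpu_wait + gpu_wait_sum/nrank_gpu_shared;

    printf("rank %d: gpu %d, local wait %.1f, averaged wait %.1f, DLB load %.1f\n",
           rank, gpu_id, gpu_wait, gpu_wait_sum/nrank_gpu_shared, load);

    MPI_Comm_free(&comm_gpu_shared);
    MPI_Finalize();
    return 0;
}

With this averaging, ranks that share a GPU report the same wait contribution, so the domain sizes are adjusted only for genuine per-domain load imbalance rather than for GPU scheduling order.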