Corrected dynamic load balancing when sharing GPUs
author    Berk Hess <hess@kth.se>
Fri, 27 Sep 2013 12:24:58 +0000 (14:24 +0200)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Thu, 7 Nov 2013 22:44:26 +0000 (23:44 +0100)
When sharing GPUs over MPI ranks, the time each rank observes the GPU as
busy might not reflect that rank's actual load. To make the dynamic load
balancing between domains work correctly, the GPU wait times are now
redistributed over the ranks/domains sharing a GPU.
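
In effect, each rank's measured GPU wait time w_i is replaced by the mean
over the n ranks that share its GPU, so the load a rank reports to the DLB
becomes

    load_i' = load_i - w_i + (w_1 + ... + w_n)/n

The total load summed over the sharing ranks is unchanged; only its
attribution across them is evened out.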

Change-Id: Id9414e3ef7cc5a73a2b4560a0e10c2ee8ab1257f

include/domdec.h
src/kernel/runner.c
src/mdlib/domdec.c
src/mdlib/sim_util.c

index 19a3a78e44baa17c9d0da7f4101e17526f3660f0..e94139b3a6b0b8523b0164e3d35726668e189387 100644 (file)
@@ -148,6 +148,16 @@ void change_dd_dlb_cutoff_limit(t_commrec *cr);
  * possible after subsequently setting a shorter cut-off with change_dd_cutoff.
  */
 
+GMX_LIBMD_EXPORT
+void dd_setup_dlb_resource_sharing(t_commrec *cr,
+                                   const gmx_hw_info_t *hwinfo,
+                                   const gmx_hw_opt_t *hw_opt);
+/* When domains (PP MPI ranks) share a GPU, the individual GPU wait times
+ * are meaningless, as they depend on the order in which tasks on the same
+ * GPU finish. Therefore the wait times need to be averaged over the ranks
+ * sharing the same GPU. This function sets up the communication for that.
+ */
+
 GMX_LIBMD_EXPORT
 void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd);
 
@@ -159,7 +169,7 @@ void dd_collect_state(gmx_domdec_t *dd,
                       t_state *state_local, t_state *state);
 
 enum {
-    ddCyclStep, ddCyclPPduringPME, ddCyclF, ddCyclPME, ddCyclNr
+    ddCyclStep, ddCyclPPduringPME, ddCyclF, ddCyclWaitGPU, ddCyclPME, ddCyclNr
 };
 
 GMX_LIBMD_EXPORT
index 42ef770eda2eb4d88ce4ce3f2c2a97fb661e98c2..17b5f351ad044b5756ad317285078c0d1c55671c 100644 (file)
@@ -1485,6 +1485,12 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
     /* check consistency of CPU acceleration and number of GPUs selected */
     gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt, bUseGPU);
 
+    if (DOMAINDECOMP(cr))
+    {
+        /* When we share GPUs over ranks, we need to know this for the DLB */
+        dd_setup_dlb_resource_sharing(cr, hwinfo, hw_opt);
+    }
+
     /* getting number of PP/PME threads
        PME: env variable should be read only on one node to make sure it is
        identical everywhere;
index d78ed38a1d97d86ceadb0bea7a994b6b4255290c..d488b0bdf16840133bdf9ddecf4d28a6a9352969 100644 (file)
@@ -73,6 +73,7 @@
 #include "nbnxn_search.h"
 #include "bondf.h"
 #include "gmx_omp_nthreads.h"
+#include "gpu_utils.h"
 
 #ifdef GMX_LIB_MPI
 #include <mpi.h>
@@ -368,8 +369,10 @@ typedef struct gmx_domdec_comm
     /* Stuff for load communication */
     gmx_bool           bRecordLoad;
     gmx_domdec_load_t *load;
+    int                nrank_gpu_shared;
 #ifdef GMX_MPI
     MPI_Comm          *mpi_comm_load;
+    MPI_Comm           mpi_comm_gpu_shared;
 #endif
 
     /* Maximum DLB scaling per load balancing step in percent */
@@ -2897,12 +2900,40 @@ static float dd_force_load(gmx_domdec_comm_t *comm)
         if (comm->cycl_n[ddCyclF] > 1)
         {
             /* Subtract the maximum of the last n cycle counts
-             * to get rid of possible high counts due to other soures,
+             * to get rid of possible high counts due to other sources,
              * for instance system activity, that would otherwise
              * affect the dynamic load balancing.
              */
             load -= comm->cycl_max[ddCyclF];
         }
+
+#ifdef GMX_MPI
+        if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
+        {
+            float gpu_wait, gpu_wait_sum;
+
+            gpu_wait = comm->cycl[ddCyclWaitGPU];
+            if (comm->cycl_n[ddCyclF] > 1)
+            {
+                /* We should remove the WaitGPU time of the same MD step
+                 * as the one with the maximum F time, since the F time
+                 * and the wait time are not independent.
+                 * Furthermore, the step for the max F time should be chosen
+                 * the same on all ranks that share the same GPU.
+                 * But to keep the code simple, we remove the average instead.
+                 * The main reason for artificially long times at some steps
+                 * is spurious CPU activity or MPI time, so we don't expect
+                 * that changes in the GPU wait time matter a lot here.
+                 */
+                gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
+            }
+            /* Sum the wait times over the ranks that share the same GPU */
+            MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
+                          comm->mpi_comm_gpu_shared);
+            /* Replace the wait time by the average over the ranks */
+            load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
+        }
+#endif
     }
 
     return load;
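
Worked example: suppose two ranks share a GPU and, over a load-balancing
interval, measure GPU wait times of 80 and 20 cycle units simply because
their kernels happened to complete in that order. The MPI_Allreduce over
mpi_comm_gpu_shared yields gpu_wait_sum = 100, so each rank reports a wait
of 50 and the DLB stops resizing domains based on arbitrary completion
order. The (cycl_n - 1)/cycl_n scaling approximates dropping the wait time
of the step with the maximum F time, mirroring the subtraction of
cycl_max[ddCyclF] from the force load above.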
@@ -5645,6 +5676,62 @@ static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
 }
 #endif
 
+void dd_setup_dlb_resource_sharing(t_commrec *cr,
+                                   const gmx_hw_info_t *hwinfo,
+                                   const gmx_hw_opt_t *hw_opt)
+{
+#ifdef GMX_MPI
+    int           physicalnode_id_hash;
+    int           gpu_id;
+    gmx_domdec_t *dd;
+    MPI_Comm      mpi_comm_pp_physicalnode;
+
+    if (!(cr->duty & DUTY_PP) ||
+        hw_opt->gpu_opt.ncuda_dev_use == 0)
+    {
+        /* Only PP nodes (currently) use GPUs.
+         * If we don't have GPUs, there are no resources to share.
+         */
+        return;
+    }
+
+    physicalnode_id_hash = gmx_physicalnode_id_hash();
+
+    gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->nodeid);
+
+    dd = cr->dd;
+
+    if (debug)
+    {
+        fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
+        fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
+                dd->rank, physicalnode_id_hash, gpu_id);
+    }
+    /* Split the PP communicator over the physical nodes */
+    /* TODO: See if we should store this (from before), as it's also used
+     * for the nodecomm summation.
+     */
+    MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
+                   &mpi_comm_pp_physicalnode);
+    MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
+                   &dd->comm->mpi_comm_gpu_shared);
+    MPI_Comm_free(&mpi_comm_pp_physicalnode);
+    MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
+
+    if (debug)
+    {
+        fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
+    }
+
+    /* Note that some ranks could share a GPU, while others don't */
+
+    if (dd->comm->nrank_gpu_shared == 1)
+    {
+        MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
+    }
+#endif
+}
+
 static void make_load_communicators(gmx_domdec_t *dd)
 {
 #ifdef GMX_MPI
@@ -6616,6 +6703,9 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
 
     }
 
+    /* Initialize the GPU share count to 0; it might change later */
+    comm->nrank_gpu_shared = 0;
+
     comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
 
     comm->bDynLoadBal = (comm->eDLB == edlbYES);
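
For reference, below is a minimal standalone sketch (an illustration, not
GROMACS code) of the two-stage MPI_Comm_split pattern used by
dd_setup_dlb_resource_sharing above. The node_hash and gpu_id values stand
in for gmx_physicalnode_id_hash() and get_gpu_device_id(); the assignment
of two GPUs per node is an assumption made up for the example.

    #include <mpi.h>
    #include <stdio.h>

    int main(int argc, char **argv)
    {
        int      rank, node_hash, gpu_id, nshared;
        MPI_Comm node_comm, gpu_comm;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        node_hash = 0;        /* stand-in: all ranks on one physical node */
        gpu_id    = rank % 2; /* stand-in: two GPUs per node */

        /* Stage 1: group the ranks by physical node */
        MPI_Comm_split(MPI_COMM_WORLD, node_hash, rank, &node_comm);
        /* Stage 2: within a node, group the ranks by assigned GPU */
        MPI_Comm_split(node_comm, gpu_id, rank, &gpu_comm);
        /* The intermediate per-node communicator is no longer needed */
        MPI_Comm_free(&node_comm);

        MPI_Comm_size(gpu_comm, &nshared);
        printf("rank %d shares its GPU with %d rank(s) in total\n",
               rank, nshared);

        MPI_Comm_free(&gpu_comm);
        MPI_Finalize();
        return 0;
    }

Ranks that end up alone in gpu_comm correspond to the
nrank_gpu_shared == 1 case above, where the communicator is freed
immediately because no wait-time averaging is needed.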
index fa04b6fc97b6e2d371a2ff111d7cee89c31314db..a6cafa8304657dd1582200325dd9e67f7d2cc986 100644 (file)
@@ -766,12 +766,13 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
     matrix              boxs;
     rvec                vzero, box_diag;
     real                e, v, dvdl;
-    float               cycles_pme, cycles_force;
+    float               cycles_pme, cycles_force, cycles_wait_gpu;
     nonbonded_verlet_t *nbv;
 
-    cycles_force   = 0;
-    nbv            = fr->nbv;
-    nb_kernel_type = fr->nbv->grp[0].kernel_type;
+    cycles_force    = 0;
+    cycles_wait_gpu = 0;
+    nbv             = fr->nbv;
+    nb_kernel_type  = fr->nbv->grp[0].kernel_type;
 
     start  = mdatoms->start;
     homenr = mdatoms->homenr;
@@ -1284,13 +1285,17 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         {
             if (bUseGPU)
             {
+                float cycles_tmp;
+
                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
                 nbnxn_cuda_wait_gpu(nbv->cu_nbv,
                                     nbv->grp[eintNonlocal].nbat,
                                     flags, eatNonlocal,
                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                     fr->fshift);
-                cycles_force += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
+                cycles_tmp       = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
+                cycles_wait_gpu += cycles_tmp;
+                cycles_force    += cycles_tmp;
             }
             else
             {
@@ -1356,7 +1361,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
                                 flags, eatLocal,
                                 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                 fr->fshift);
-            wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+            cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
 
             /* now clear the GPU outputs while we finish the step on the CPU */
 
@@ -1390,6 +1395,10 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         if (wcycle)
         {
             dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
+            if (bUseGPU)
+            {
+                dd_cycles_add(cr->dd, cycles_wait_gpu, ddCyclWaitGPU);
+            }
         }
     }
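
Note that the non-local GPU wait (ewcWAIT_GPU_NB_NL) is added to both
cycles_wait_gpu and cycles_force: it was already included in cycles_force
before this change, so the existing ddCyclF accounting is unchanged, while
cycles_wait_gpu additionally accumulates the local wait (ewcWAIT_GPU_NB_L)
and is handed to the DLB as ddCyclWaitGPU only when bUseGPU is set.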