Corrected dynamic load balancing when sharing GPUs
author    Berk Hess <hess@kth.se>
Fri, 27 Sep 2013 12:24:58 +0000 (14:24 +0200)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Thu, 7 Nov 2013 22:44:26 +0000 (23:44 +0100)
When sharing GPUs over MPI ranks, the time each rank observes the GPU as
busy might not reflect that rank's actual load. To make the dynamic load
balancing between domains work correctly, the GPU wait times are now
redistributed over the ranks/domains sharing a GPU.
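
In effect, each rank's measured GPU wait time w_i is replaced by the mean
over the n ranks that share its GPU, so the load a rank reports to the DLB
becomes

    load_i' = load_i - w_i + (w_1 + ... + w_n)/n

The total load summed over the sharing ranks is unchanged; only its
attribution across them is evened out.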

Change-Id: Id9414e3ef7cc5a73a2b4560a0e10c2ee8ab1257f

include/domdec.h
src/kernel/runner.c
src/mdlib/domdec.c
src/mdlib/sim_util.c

index 19a3a78e44baa17c9d0da7f4101e17526f3660f0..e94139b3a6b0b8523b0164e3d35726668e189387 100644 (file)
@@ -148,6 +148,16 @@ void change_dd_dlb_cutoff_limit(t_commrec *cr);
  * possible after subsequently setting a shorter cut-off with change_dd_cutoff.
  */
 
+GMX_LIBMD_EXPORT
+void dd_setup_dlb_resource_sharing(t_commrec *cr,
+                                   const gmx_hw_info_t *hwinfo,
+                                   const gmx_hw_opt_t *hw_opt);
+/* When domains (PP MPI ranks) share a GPU, the individual GPU wait times
+ * are meaningless, as they depend on the order in which tasks on the same
+ * GPU finish. Therefore the wait times need to be averaged over the ranks
+ * sharing the same GPU. This function sets up the communication for that.
+ */
+
 GMX_LIBMD_EXPORT
 void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd);
 
@@ -159,7 +169,7 @@ void dd_collect_state(gmx_domdec_t *dd,
                       t_state *state_local, t_state *state);
 
 enum {
-    ddCyclStep, ddCyclPPduringPME, ddCyclF, ddCyclPME, ddCyclNr
+    ddCyclStep, ddCyclPPduringPME, ddCyclF, ddCyclWaitGPU, ddCyclPME, ddCyclNr
 };
 
 GMX_LIBMD_EXPORT
index 42ef770eda2eb4d88ce4ce3f2c2a97fb661e98c2..17b5f351ad044b5756ad317285078c0d1c55671c 100644 (file)
@@ -1485,6 +1485,12 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
     /* check consistency of CPU acceleration and number of GPUs selected */
     gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt, bUseGPU);
 
+    if (DOMAINDECOMP(cr))
+    {
+        /* When we share GPUs over ranks, we need to know this for the DLB */
+        dd_setup_dlb_resource_sharing(cr, hwinfo, hw_opt);
+    }
+
     /* getting number of PP/PME threads
        PME: env variable should be read only on one node to make sure it is
        identical everywhere;
index d78ed38a1d97d86ceadb0bea7a994b6b4255290c..d488b0bdf16840133bdf9ddecf4d28a6a9352969 100644 (file)
@@ -73,6 +73,7 @@
 #include "nbnxn_search.h"
 #include "bondf.h"
 #include "gmx_omp_nthreads.h"
+#include "gpu_utils.h"
 
 #ifdef GMX_LIB_MPI
 #include <mpi.h>
@@ -368,8 +369,10 @@ typedef struct gmx_domdec_comm
     /* Stuff for load communication */
     gmx_bool           bRecordLoad;
     gmx_domdec_load_t *load;
+    int                nrank_gpu_shared;
 #ifdef GMX_MPI
     MPI_Comm          *mpi_comm_load;
+    MPI_Comm           mpi_comm_gpu_shared;
 #endif
 
     /* Maximum DLB scaling per load balancing step in percent */
@@ -2897,12 +2900,40 @@ static float dd_force_load(gmx_domdec_comm_t *comm)
         if (comm->cycl_n[ddCyclF] > 1)
         {
             /* Subtract the maximum of the last n cycle counts
-             * to get rid of possible high counts due to other soures,
+             * to get rid of possible high counts due to other sources,
              * for instance system activity, that would otherwise
              * affect the dynamic load balancing.
              */
             load -= comm->cycl_max[ddCyclF];
         }
+
+#ifdef GMX_MPI
+        if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
+        {
+            float gpu_wait, gpu_wait_sum;
+
+            gpu_wait = comm->cycl[ddCyclWaitGPU];
+            if (comm->cycl_n[ddCyclF] > 1)
+            {
+                /* We should remove the WaitGPU time of the same MD step
+                 * as the one with the maximum F time, since the F time
+                 * and the wait time are not independent.
+                 * Furthermore, the step for the max F time should be chosen
+                 * the same on all ranks that share the same GPU.
+                 * But to keep the code simple, we remove the average instead.
+                 * The main reason for artificially long times at some steps
+                 * is spurious CPU activity or MPI time, so we don't expect
+                 * that changes in the GPU wait time matter a lot here.
+                 */
+                gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
+            }
+            /* Sum the wait times over the ranks that share the same GPU */
+            MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
+                          comm->mpi_comm_gpu_shared);
+            /* Replace the wait time by the average over the ranks */
+            load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
+        }
+#endif
     }
 
     return load;
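
Worked example: suppose two ranks share a GPU and, over a load-balancing
interval, measure GPU wait times of 80 and 20 cycle units simply because
their kernels happened to complete in that order. The MPI_Allreduce over
mpi_comm_gpu_shared yields gpu_wait_sum = 100, so each rank reports a wait
of 50 and the DLB stops resizing domains based on arbitrary completion
order. The (cycl_n - 1)/cycl_n scaling approximates dropping the wait time
of the step with the maximum F time, mirroring the subtraction of
cycl_max[ddCyclF] from the force load above.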
@@ -5645,6 +5676,62 @@ static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
 }
 #endif
 
+void dd_setup_dlb_resource_sharing(t_commrec *cr,
+                                   const gmx_hw_info_t *hwinfo,
+                                   const gmx_hw_opt_t *hw_opt)
+{
+#ifdef GMX_MPI
+    int           physicalnode_id_hash;
+    int           gpu_id;
+    gmx_domdec_t *dd;
+    MPI_Comm      mpi_comm_pp_physicalnode;
+
+    if (!(cr->duty & DUTY_PP) ||
+        hw_opt->gpu_opt.ncuda_dev_use == 0)
+    {
+        /* Only PP nodes (currently) use GPUs.
+         * If we don't have GPUs, there are no resources to share.
+         */
+        return;
+    }
+
+    physicalnode_id_hash = gmx_physicalnode_id_hash();
+
+    gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->nodeid);
+
+    dd = cr->dd;
+
+    if (debug)
+    {
+        fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
+        fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
+                dd->rank, physicalnode_id_hash, gpu_id);
+    }
+    /* Split the PP communicator over the physical nodes */
+    /* TODO: See if we should store this (from before), as it's also used
+     * for the nodecomm summation.
+     */
+    MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
+                   &mpi_comm_pp_physicalnode);
+    MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
+                   &dd->comm->mpi_comm_gpu_shared);
+    MPI_Comm_free(&mpi_comm_pp_physicalnode);
+    MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
+
+    if (debug)
+    {
+        fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
+    }
+
+    /* Note that some ranks could share a GPU, while others don't */
+
+    if (dd->comm->nrank_gpu_shared == 1)
+    {
+        MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
+    }
+#endif
+}
+
 static void make_load_communicators(gmx_domdec_t *dd)
 {
 #ifdef GMX_MPI
@@ -6616,6 +6703,9 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
 
     }
 
+    /* Initialize the GPU share count to 0; it might change later */
+    comm->nrank_gpu_shared = 0;
+
     comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
 
     comm->bDynLoadBal = (comm->eDLB == edlbYES);
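
For reference, below is a minimal standalone sketch (an illustration, not
GROMACS code) of the two-stage MPI_Comm_split pattern used by
dd_setup_dlb_resource_sharing above. The node_hash and gpu_id values stand
in for gmx_physicalnode_id_hash() and get_gpu_device_id(); the assignment
of two GPUs per node is an assumption made up for the example.

    #include <mpi.h>
    #include <stdio.h>

    int main(int argc, char **argv)
    {
        int      rank, node_hash, gpu_id, nshared;
        MPI_Comm node_comm, gpu_comm;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        node_hash = 0;        /* stand-in: all ranks on one physical node */
        gpu_id    = rank % 2; /* stand-in: two GPUs per node */

        /* Stage 1: group the ranks by physical node */
        MPI_Comm_split(MPI_COMM_WORLD, node_hash, rank, &node_comm);
        /* Stage 2: within a node, group the ranks by assigned GPU */
        MPI_Comm_split(node_comm, gpu_id, rank, &gpu_comm);
        /* The intermediate per-node communicator is no longer needed */
        MPI_Comm_free(&node_comm);

        MPI_Comm_size(gpu_comm, &nshared);
        printf("rank %d shares its GPU with %d rank(s) in total\n",
               rank, nshared);

        MPI_Comm_free(&gpu_comm);
        MPI_Finalize();
        return 0;
    }

Ranks that end up alone in gpu_comm correspond to the
nrank_gpu_shared == 1 case above, where the communicator is freed
immediately because no wait-time averaging is needed.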
index fa04b6fc97b6e2d371a2ff111d7cee89c31314db..a6cafa8304657dd1582200325dd9e67f7d2cc986 100644 (file)
@@ -766,12 +766,13 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
     matrix              boxs;
     rvec                vzero, box_diag;
     real                e, v, dvdl;
-    float               cycles_pme, cycles_force;
+    float               cycles_pme, cycles_force, cycles_wait_gpu;
     nonbonded_verlet_t *nbv;
 
-    cycles_force   = 0;
-    nbv            = fr->nbv;
-    nb_kernel_type = fr->nbv->grp[0].kernel_type;
+    cycles_force    = 0;
+    cycles_wait_gpu = 0;
+    nbv             = fr->nbv;
+    nb_kernel_type  = fr->nbv->grp[0].kernel_type;
 
     start  = mdatoms->start;
     homenr = mdatoms->homenr;
@@ -1284,13 +1285,17 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         {
             if (bUseGPU)
             {
+                float cycles_tmp;
+
                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
                 nbnxn_cuda_wait_gpu(nbv->cu_nbv,
                                     nbv->grp[eintNonlocal].nbat,
                                     flags, eatNonlocal,
                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                     fr->fshift);
-                cycles_force += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
+                cycles_tmp       = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
+                cycles_wait_gpu += cycles_tmp;
+                cycles_force    += cycles_tmp;
             }
             else
             {
@@ -1356,7 +1361,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
                                 flags, eatLocal,
                                 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                 fr->fshift);
-            wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+            cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
 
             /* now clear the GPU outputs while we finish the step on the CPU */
 
@@ -1390,6 +1395,10 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         if (wcycle)
         {
             dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
+            if (bUseGPU)
+            {
+                dd_cycles_add(cr->dd, cycles_wait_gpu, ddCyclWaitGPU);
+            }
         }
     }
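
Note that the non-local GPU wait (ewcWAIT_GPU_NB_NL) is added to both
cycles_wait_gpu and cycles_force: it was already included in cycles_force
before this change, so the existing ddCyclF accounting is unchanged, while
cycles_wait_gpu additionally accumulates the local wait (ewcWAIT_GPU_NB_L)
and is handed to the DLB as ddCyclWaitGPU only when bUseGPU is set.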