Added GPU local wait to load balancing
author    Berk Hess <hess@kth.se>
          Thu, 11 Sep 2014 13:02:14 +0000 (15:02 +0200)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
          Tue, 30 Sep 2014 11:10:22 +0000 (13:10 +0200)
The GPU local wait time was not included in the load imbalance timing.
This could lead to incorrect load imbalance reporting and incorrect
dynamic load balancing, especially with CUDA stream priorities.
Note that we cannot accurately measure the GPU time when the kernel
finishes during dd_move_f; in that case we can only estimate it.

Change-Id: If4bb0f867593450cc025c8593672b0569397b1d6
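The idea in compressed form: an extra counter is started just before the force
communication, so the span covering dd_move_f plus the final wait brackets the
remaining GPU work. If the wait measured after dd_move_f is shorter than the
CUDA API call overhead, the kernel finished at some unknown point during the
communication, and half of the bracketed span is used as a compromise. Below is
a minimal standalone sketch of that decision, with hypothetical names and an
illustrative overhead margin; it is not the GROMACS API, just the logic of the
diff further down.

/* Standalone illustration of the GPU local wait estimate used for load
 * balancing; names and the margin value are illustrative only. */
static float gpu_wait_estimate(float cycles_wait,      /* measured wait after dd_move_f */
                               float cycles_bracketed, /* dd_move_f + wait, full span   */
                               int   have_dd_comm)     /* was there force communication? */
{
    const float api_overhead_margin = 50000.0f; /* cycles, CUDA API call cost */

    if (!have_dd_comm)
    {
        /* No force communication, so the wait was actually timed */
        return cycles_wait;
    }
    if (cycles_wait < api_overhead_margin)
    {
        /* The kernel likely finished somewhere during dd_move_f; the true
         * wait is between 0 and the bracketed span, so take half of it. */
        return 0.5f * cycles_bracketed;
    }
    /* The GPU was still running when dd_move_f returned; the bracketed
     * span estimates the remaining GPU kernel and transfer time. */
    return cycles_bracketed;
}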

src/gromacs/mdlib/sim_util.c
src/gromacs/timing/wallcycle.c
src/gromacs/timing/wallcycle.h

index 0c2b89605bfe99f022e755efd260b86b2437e0ae..a6b166ccfd5acedb51742f9063d40207edda4e23 100644 (file)
@@ -1413,6 +1413,16 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
 
     if (bDoForces && DOMAINDECOMP(cr))
     {
+        if (bUseGPU)
+        {
+            /* We are done with the CPU compute, but the GPU local non-bonded
+             * kernel can still be running while we communicate the forces.
+             * We start a counter here, so we can, hopefully, time the rest
+             * of the GPU kernel execution and data transfer.
+             */
+            wallcycle_start(wcycle, ewcWAIT_GPU_NB_L_EST);
+        }
+
         /* Communicate the forces */
         wallcycle_start(wcycle, ewcMOVEF);
         dd_move_f(cr->dd, f, fr->fshift);
@@ -1443,13 +1453,44 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         /* wait for local forces (or calculate in emulation mode) */
         if (bUseGPU)
         {
+            float       cycles_tmp, cycles_wait_est;
+            const float cuda_api_overhead_margin = 50000.0f; /* cycles */
+
             wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
             nbnxn_cuda_wait_gpu(nbv->cu_nbv,
                                 nbv->grp[eintLocal].nbat,
                                 flags, eatLocal,
                                 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                 fr->fshift);
-            cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+            cycles_tmp      = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+
+            if (bDoForces && DOMAINDECOMP(cr))
+            {
+                cycles_wait_est = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L_EST);
+
+                if (cycles_tmp < cuda_api_overhead_margin)
+                {
+                    /* We measured few cycles, so it could be that the kernel
+                     * and transfer finished earlier and there was no actual
+                     * wait time, only API call overhead.
+                     * The actual time could then be anywhere between 0 and
+                     * cycles_wait_est. As a compromise, we use half of it.
+                     */
+                    cycles_wait_est *= 0.5f;
+                }
+            }
+            else
+            {
+                /* No force communication so we actually timed the wait */
+                cycles_wait_est = cycles_tmp;
+            }
+            /* Even though this is after dd_move_f, the actual task we are
+             * waiting for runs asynchronously with dd_move_f and we usually
+             * have nothing to balance it with, so we can and should add
+             * the time to the force time for load balancing.
+             */
+            cycles_force    += cycles_wait_est;
+            cycles_wait_gpu += cycles_wait_est;
 
             /* now clear the GPU outputs while we finish the step on the CPU */
 
index 98c31b55b1777642edf9becbe258067cbc487017..ae26c2cec4c0492ba6cdf92071587029d90d4709 100644 (file)
@@ -98,7 +98,7 @@ static const char *wcn[ewcNR] =
     "DD comm. bounds", "Vsite constr.", "Send X to PME", "Neighbor search", "Launch GPU ops.",
     "Comm. coord.", "Born radii", "Force", "Wait + Comm. F", "PME mesh",
     "PME redist. X/F", "PME spread/gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve LJ", "PME solve Elec",
-    "PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "NB X/F buffer ops.",
+    "PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "Wait GPU loc. est.", "NB X/F buffer ops.",
     "Vsite spread", "COM pull force",
     "Write traj.", "Update", "Constraints", "Comm. energies",
     "Enforced rotation", "Add rot. forces", "Coordinate swapping", "IMD", "Test"
@@ -387,6 +387,12 @@ void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc)
 
     wcc = wc->wcc;
 
+    /* The GPU wait estimate counter is used for load balancing only
+     * and will mess up the total due to double counting: clear it.
+     */
+    wcc[ewcWAIT_GPU_NB_L_EST].n = 0;
+    wcc[ewcWAIT_GPU_NB_L_EST].c = 0;
+
     for (i = 0; i < ewcNR; i++)
     {
         if (is_pme_counter(i) || (i == ewcRUN && cr->duty == DUTY_PME))
index f91a455fb53bd395801341770208c803ff569c54..7b48849707ca4b3ab1d78e93f8b02c779decf5d7 100644 (file)
@@ -50,7 +50,7 @@ enum {
     ewcDDCOMMBOUND, ewcVSITECONSTR, ewcPP_PMESENDX, ewcNS, ewcLAUNCH_GPU_NB,
     ewcMOVEX, ewcGB, ewcFORCE, ewcMOVEF, ewcPMEMESH,
     ewcPME_REDISTXF, ewcPME_SPREADGATHER, ewcPME_FFT, ewcPME_FFTCOMM, ewcLJPME, ewcPME_SOLVE,
-    ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcNB_XF_BUF_OPS,
+    ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcWAIT_GPU_NB_L_EST, ewcNB_XF_BUF_OPS,
     ewcVSITESPREAD, ewcPULLPOT,
     ewcTRAJ, ewcUPDATE, ewcCONSTR, ewcMoveE, ewcROT, ewcROTadd, ewcSWAP, ewcIMD,
     ewcTEST, ewcNR
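For context on why both wallcycle.c and wallcycle.h are touched: the name table
wcn[] is indexed by the ewc* enum, so the new counter has to be inserted at the
same position in both lists. A reduced sketch of that convention, showing only
the neighbouring entries (the real lists are much longer):

/* Reduced illustration: the string table is indexed by the enum, so a new
 * counter is added at the same position in both files. */
enum {
    ewcWAIT_GPU_NB_NL,    /* "Wait GPU nonlocal"  */
    ewcWAIT_GPU_NB_L,     /* "Wait GPU local"     */
    ewcWAIT_GPU_NB_L_EST, /* "Wait GPU loc. est." */
    ewcNB_XF_BUF_OPS,     /* "NB X/F buffer ops." */
    ewcNR
};

static const char *wcn[ewcNR] = {
    "Wait GPU nonlocal", "Wait GPU local", "Wait GPU loc. est.", "NB X/F buffer ops."
};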