Rename GPU launch/wait cycle counters
author Szilárd Páll <pall.szilard@gmail.com>
Thu, 21 Sep 2017 14:13:43 +0000 (16:13 +0200)
committer Berk Hess <hess@kth.se>
Tue, 26 Sep 2017 08:08:35 +0000 (10:08 +0200)
In preparation for the PME GPU task and GPU launch overhead to be
counted together in the same counter for all GPU tasks, the current main
counters have been renamed to be more general. The labels of the GPU waits
in the performance table have also been renamed to reflect the task name.
Additionally, a non-bonded-specific sub-counter has been added.

Change-Id: I65a15b0090c1ccebb300cf425c7b3be4100e17a0

src/gromacs/mdlib/sim_util.cpp
src/gromacs/timing/wallcycle.cpp
src/gromacs/timing/wallcycle.h

index 4d4114ed4bdcda8df6415abe9263ffcc8b8674ef..2cbe8163ff9bb585ba5ec99d0bb5bc3f5f122365 100644 (file)
@@ -897,16 +897,18 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
     /* initialize the GPU atom data and copy shift vector */
     if (bUseGPU)
     {
+        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
+        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+
         if (bNS)
         {
-            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
             nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
-            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
         }
 
-        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
         nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
-        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+
+        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
     }
 
     /* do local pair search */
@@ -955,11 +957,13 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
             ddOpenBalanceRegionGpu(cr->dd);
         }
 
-        wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
-        /* launch local nonbonded F on GPU */
+        wallcycle_start(wcycle, ewcLAUNCH_GPU);
+        wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+        /* launch local nonbonded work on GPU */
         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
                      step, nrnb, wcycle);
-        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
     }
 
     /* Communicate coordinates and sum dipole if necessary +
@@ -1034,18 +1038,21 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
 
         if (bUseGPU && !bDiffKernels)
         {
-            wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
-            /* launch non-local nonbonded F on GPU */
+            wallcycle_start(wcycle, ewcLAUNCH_GPU);
+            wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+            /* launch non-local nonbonded tasks on GPU */
             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
                          step, nrnb, wcycle);
-            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+            wallcycle_stop(wcycle, ewcLAUNCH_GPU);
         }
     }
 
     if (bUseGPU)
     {
         /* launch D2H copy-back F */
-        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
+        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
         if (DOMAINDECOMP(cr) && !bDiffKernels)
         {
             nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintNonlocal].nbat,
@@ -1053,7 +1060,8 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         }
         nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintLocal].nbat,
                                  flags, eatLocal);
-        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
     }
 
     if (bStateChanged && inputrecNeedMutot(inputrec))
@@ -1340,7 +1348,8 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
             }
 
             /* now clear the GPU outputs while we finish the step on the CPU */
-            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
+            wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
             nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
 
             /* Is dynamic pair-list pruning activated? */
@@ -1365,7 +1374,8 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
                                                       numRollingParts);
                 }
             }
-            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+            wallcycle_stop(wcycle, ewcLAUNCH_GPU);
         }
         else
         {
index 637c8d7dac37a54eeb2912397032215e0a06c22b..efa296bdffa351e5f22cd077a2ac532a2e890fb3 100644 (file)
@@ -105,7 +105,7 @@ static const char *wcn[ewcNR] =
     "DD comm. bounds", "Vsite constr.", "Send X to PME", "Neighbor search", "Launch GPU ops.",
     "Comm. coord.", "Born radii", "Force", "Wait + Comm. F", "PME mesh",
     "PME redist. X/F", "PME spread", "PME gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve LJ", "PME solve Elec",
-    "PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "NB X/F buffer ops.",
+    "PME wait for PP", "Wait + Recv. PME F", "Wait GPU NB nonloc.", "Wait GPU NB local", "NB X/F buffer ops.",
     "Vsite spread", "COM pull force",
     "Write traj.", "Update", "Constraints", "Comm. energies",
     "Enforced rotation", "Add rot. forces", "Position swapping", "IMD", "Test"
@@ -122,6 +122,7 @@ static const char *wcsn[ewcsNR] =
     "Listed buffer ops.",
     "Nonbonded pruning",
     "Nonbonded F",
+    "Launch NB GPU tasks",
     "Ewald F correction",
     "NB X buffer ops.",
     "NB F buffer ops.",
index b1325e21d08fca7436af12decb1850dc35c69b0e..8ad165514a89f4d68348113a212884c93f00e983 100644 (file)
@@ -49,7 +49,7 @@ struct t_commrec;
 
 enum {
     ewcRUN, ewcSTEP, ewcPPDURINGPME, ewcDOMDEC, ewcDDCOMMLOAD,
-    ewcDDCOMMBOUND, ewcVSITECONSTR, ewcPP_PMESENDX, ewcNS, ewcLAUNCH_GPU_NB,
+    ewcDDCOMMBOUND, ewcVSITECONSTR, ewcPP_PMESENDX, ewcNS, ewcLAUNCH_GPU,
     ewcMOVEX, ewcGB, ewcFORCE, ewcMOVEF, ewcPMEMESH,
     ewcPME_REDISTXF, ewcPME_SPREAD, ewcPME_GATHER, ewcPME_FFT, ewcPME_FFTCOMM, ewcLJPME, ewcPME_SOLVE,
     ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcNB_XF_BUF_OPS,
@@ -70,6 +70,7 @@ enum {
     ewcsLISTED_BUF_OPS,
     ewcsNONBONDED_PRUNING,
     ewcsNONBONDED,
+    ewcsLAUNCH_GPU_NONBONDED,
     ewcsEWALD_CORRECTION,
     ewcsNB_X_BUF_OPS,
     ewcsNB_F_BUF_OPS,