Tweak the bonded GPU energy copy/wait operations
author Szilárd Páll <pall.szilard@gmail.com>
Mon, 10 Dec 2018 00:09:13 +0000 (01:09 +0100)
committer Mark Abraham <mark.j.abraham@gmail.com>
Tue, 11 Dec 2018 08:57:38 +0000 (09:57 +0100)
Move the launch of the device-to-host copy earlier and add a wait cycle
counter around the operation that may block, and which can therefore
spend time waiting rather than just launching.

Change-Id: I241001fc0b60a8ae994f173bc3b38f0c4bbd8fff

src/gromacs/mdlib/sim_util.cpp
src/gromacs/timing/wallcycle.cpp
src/gromacs/timing/wallcycle.h

index e5b269f6c8cbd4a400102c52698eda2634610095..5205830b050fc72a978fc1cf109e321336de9d56 100644 (file)
@@ -1385,6 +1385,13 @@ static void do_force_cutsVERLET(FILE *fplog,
         nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
                                  flags, eatLocal, ppForceWorkload->haveGpuBondedWork);
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+
+        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
+        if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
+        {
+            fr->gpuBonded->launchEnergyTransfer();
+        }
+        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
     }
 
@@ -1699,27 +1706,29 @@ static void do_force_cutsVERLET(FILE *fplog,
         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
     }
 
-    /* Do the nonbonded GPU (or emulation) force buffer reduction
-     * on the non-alternating path. */
-    if (bUseOrEmulGPU && !alternateGpuWait)
-    {
-        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
-                                       nbv->nbat, f, wcycle);
-    }
-
     if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
     {
+        wallcycle_start(wcycle, ewcWAIT_GPU_BONDED);
+        // in principle this should be included in the DD balancing region,
+        // but generally it is infrequent so we'll omit it for the sake of
+        // simpler code
+        fr->gpuBonded->accumulateEnergyTerms(enerd);
+        wallcycle_stop(wcycle, ewcWAIT_GPU_BONDED);
+
         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
-        fr->gpuBonded->launchEnergyTransfer();
-        fr->gpuBonded->accumulateEnergyTerms(enerd);
-        // TODO The clearing call could come later in the
-        // force-calculation sequence.
         fr->gpuBonded->clearEnergies();
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
     }
 
+    /* Do the nonbonded GPU (or emulation) force buffer reduction
+     * on the non-alternating path. */
+    if (bUseOrEmulGPU && !alternateGpuWait)
+    {
+        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
+                                       nbv->nbat, f, wcycle);
+    }
     if (DOMAINDECOMP(cr))
     {
         dd_force_flop_stop(cr->dd, nrnb);
index 99da970cdaeb8c43a61b1831c7ce0502004e0aca..92ce8cb343cb6269564c9e1c1225618fb4f81efb 100644 (file)
@@ -109,7 +109,7 @@ static const char *wcn[ewcNR] =
     "PME redist. X/F", "PME spread", "PME gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve LJ", "PME solve Elec",
     "PME wait for PP", "Wait + Recv. PME F",
     "Wait PME GPU spread", "PME 3D-FFT", "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
-    "Wait PME GPU gather", "Reduce GPU PME F",
+    "Wait PME GPU gather", "Wait Bonded GPU", "Reduce GPU PME F",
     "Wait GPU NB nonloc.", "Wait GPU NB local", "NB X/F buffer ops.",
     "Vsite spread", "COM pull force", "AWH",
     "Write traj.", "Update", "Constraints", "Comm. energies",
index 123feda178a9a00da382ad58ccd5fe00d9eb6366..9468a0b60afd9fa6ef00973c3909bfb6c72eec42 100644 (file)
@@ -55,7 +55,7 @@ enum {
     ewcPME_REDISTXF, ewcPME_SPREAD, ewcPME_GATHER, ewcPME_FFT, ewcPME_FFTCOMM, ewcLJPME, ewcPME_SOLVE,
     ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF,
     ewcWAIT_GPU_PME_SPREAD, ewcPME_FFT_MIXED_MODE, ewcPME_SOLVE_MIXED_MODE,
-    ewcWAIT_GPU_PME_GATHER, ewcPME_GPU_F_REDUCTION,
+    ewcWAIT_GPU_PME_GATHER, ewcWAIT_GPU_BONDED, ewcPME_GPU_F_REDUCTION,
     ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcNB_XF_BUF_OPS,
     ewcVSITESPREAD, ewcPULLPOT, ewcAWH,
     ewcTRAJ, ewcUPDATE, ewcCONSTR, ewcMoveE, ewcROT, ewcROTadd, ewcSWAP, ewcIMD,