nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
flags, eatLocal, ppForceWorkload->haveGpuBondedWork);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+
+ wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
+ if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
+ {
+ fr->gpuBonded->launchEnergyTransfer();
+ }
+ wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
- /* Do the nonbonded GPU (or emulation) force buffer reduction
- * on the non-alternating path. */
- if (bUseOrEmulGPU && !alternateGpuWait)
- {
- nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
- nbv->nbat, f, wcycle);
- }
-
if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
{
+ wallcycle_start(wcycle, ewcWAIT_GPU_BONDED);
+ // In principle this wait should be included in the DD balancing region,
+ // but it only occurs on steps where energies are computed, which is
+ // generally infrequent, so we omit it for the sake of simpler code.
+ fr->gpuBonded->accumulateEnergyTerms(enerd);
+ wallcycle_stop(wcycle, ewcWAIT_GPU_BONDED);
+
wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
- fr->gpuBonded->launchEnergyTransfer();
- fr->gpuBonded->accumulateEnergyTerms(enerd);
- // TODO The clearing call could come later in the
- // force-calculation sequence.
fr->gpuBonded->clearEnergies();
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
+ /* Do the nonbonded GPU (or emulation) force buffer reduction
+ * on the non-alternating path. */
+ if (bUseOrEmulGPU && !alternateGpuWait)
+ {
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
+ nbv->nbat, f, wcycle);
+ }
if (DOMAINDECOMP(cr))
{
dd_force_flop_stop(cr->dd, nrnb);
"PME redist. X/F", "PME spread", "PME gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve LJ", "PME solve Elec",
"PME wait for PP", "Wait + Recv. PME F",
"Wait PME GPU spread", "PME 3D-FFT", "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
- "Wait PME GPU gather", "Reduce GPU PME F",
+ "Wait PME GPU gather", "Wait Bonded GPU", "Reduce GPU PME F",
"Wait GPU NB nonloc.", "Wait GPU NB local", "NB X/F buffer ops.",
"Vsite spread", "COM pull force", "AWH",
"Write traj.", "Update", "Constraints", "Comm. energies",
ewcPME_REDISTXF, ewcPME_SPREAD, ewcPME_GATHER, ewcPME_FFT, ewcPME_FFTCOMM, ewcLJPME, ewcPME_SOLVE,
ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF,
ewcWAIT_GPU_PME_SPREAD, ewcPME_FFT_MIXED_MODE, ewcPME_SOLVE_MIXED_MODE,
- ewcWAIT_GPU_PME_GATHER, ewcPME_GPU_F_REDUCTION,
+ ewcWAIT_GPU_PME_GATHER, ewcWAIT_GPU_BONDED, ewcPME_GPU_F_REDUCTION,
ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcNB_XF_BUF_OPS,
ewcVSITESPREAD, ewcPULLPOT, ewcAWH,
ewcTRAJ, ewcUPDATE, ewcCONSTR, ewcMoveE, ewcROT, ewcROTadd, ewcSWAP, ewcIMD,