/* initialize the GPU atom data and copy shift vector */
if (bUseGPU)
{
+ wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
+ wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+
if (bNS)
{
- wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
}
- wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+
+ wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
/* do local pair search */
ddOpenBalanceRegionGpu(cr->dd);
}
- wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
- /* launch local nonbonded F on GPU */
+ wallcycle_start(wcycle, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ /* launch local nonbonded work on GPU */
do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
step, nrnb, wcycle);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
/* Communicate coordinates and sum dipole if necessary +
if (bUseGPU && !bDiffKernels)
{
- wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
- /* launch non-local nonbonded F on GPU */
+ wallcycle_start(wcycle, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ /* launch non-local nonbonded tasks on GPU */
do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
step, nrnb, wcycle);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
}
if (bUseGPU)
{
/* launch D2H copy-back F */
- wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+ wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
+ wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
if (DOMAINDECOMP(cr) && !bDiffKernels)
{
nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintNonlocal].nbat,
}
nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintLocal].nbat,
flags, eatLocal);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
if (bStateChanged && inputrecNeedMutot(inputrec))
}
/* now clear the GPU outputs while we finish the step on the CPU */
- wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+ wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
+ wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
/* Is dynamic pair-list pruning activated? */
numRollingParts);
}
}
- wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
else
{
"DD comm. bounds", "Vsite constr.", "Send X to PME", "Neighbor search", "Launch GPU ops.",
"Comm. coord.", "Born radii", "Force", "Wait + Comm. F", "PME mesh",
"PME redist. X/F", "PME spread", "PME gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve LJ", "PME solve Elec",
- "PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "NB X/F buffer ops.",
+ "PME wait for PP", "Wait + Recv. PME F", "Wait GPU NB nonloc.", "Wait GPU NB local", "NB X/F buffer ops.",
"Vsite spread", "COM pull force",
"Write traj.", "Update", "Constraints", "Comm. energies",
"Enforced rotation", "Add rot. forces", "Position swapping", "IMD", "Test"
"Listed buffer ops.",
"Nonbonded pruning",
"Nonbonded F",
+ "Launch NB GPU tasks",
"Ewald F correction",
"NB X buffer ops.",
"NB F buffer ops.",
enum {
ewcRUN, ewcSTEP, ewcPPDURINGPME, ewcDOMDEC, ewcDDCOMMLOAD,
- ewcDDCOMMBOUND, ewcVSITECONSTR, ewcPP_PMESENDX, ewcNS, ewcLAUNCH_GPU_NB,
+ ewcDDCOMMBOUND, ewcVSITECONSTR, ewcPP_PMESENDX, ewcNS, ewcLAUNCH_GPU,
ewcMOVEX, ewcGB, ewcFORCE, ewcMOVEF, ewcPMEMESH,
ewcPME_REDISTXF, ewcPME_SPREAD, ewcPME_GATHER, ewcPME_FFT, ewcPME_FFTCOMM, ewcLJPME, ewcPME_SOLVE,
ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcNB_XF_BUF_OPS,
ewcsLISTED_BUF_OPS,
ewcsNONBONDED_PRUNING,
ewcsNONBONDED,
+ ewcsLAUNCH_GPU_NONBONDED,
ewcsEWALD_CORRECTION,
ewcsNB_X_BUF_OPS,
ewcsNB_F_BUF_OPS,