"DD comm. bounds", "Vsite constr.", "Send X to PME", "Neighbor search", "Launch GPU ops.",
"Comm. coord.", "Born radii", "Force", "Wait + Comm. F", "PME mesh",
"PME redist. X/F", "PME spread/gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve LJ", "PME solve Elec",
- "PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "NB X/F buffer ops.",
- "Vsite spread", "Write traj.", "Update", "Constraints", "Comm. energies",
+ "PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "Wait GPU loc. est.", "NB X/F buffer ops.",
+ "Vsite spread", "COM pull force",
+ "Write traj.", "Update", "Constraints", "Comm. energies",
"Enforced rotation", "Add rot. forces", "Coordinate swapping", "IMD", "Test"
};
wcc = wc->wcc;
+ /* The GPU wait estimate counter is used for load balancing only.
+ * Its cycles would be double counted in the totals, so clear it here.
+ */
+ wcc[ewcWAIT_GPU_NB_L_EST].n = 0;
+ wcc[ewcWAIT_GPU_NB_L_EST].c = 0;
+
for (i = 0; i < ewcNR; i++)
{
if (is_pme_counter(i) || (i == ewcRUN && cr->duty == DUTY_PME))
fprintf(fplog, "\n\n");
fprintf(fplog, " Computing: Num Num Call Wall time Giga-Cycles\n");
- fprintf(fplog, " Nodes Threads Count (s) total sum %%\n");
+ fprintf(fplog, " Ranks Threads Count (s) total sum %%\n");
}
void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
if (npme > 0)
{
fprintf(fplog,
- "(*) Note that with separate PME nodes, the walltime column actually sums to\n"
+ "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
" twice the total reported, but the cycle count total and %% are correct.\n"
"%s\n", hline);
}