"DD comm. bounds", "Vsite constr.", "Send X to PME", "Neighbor search", "Launch GPU ops.",
"Comm. coord.", "Born radii", "Force", "Wait + Comm. F", "PME mesh",
"PME redist. X/F", "PME spread", "PME gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve LJ", "PME solve Elec",
- "PME wait for PP", "Wait + Recv. PME F", "Wait GPU NB nonloc.", "Wait GPU NB local", "NB X/F buffer ops.",
+ "PME wait for PP", "Wait + Recv. PME F", "Wait PME GPU spread", "Wait PME GPU gather", "Wait GPU NB nonloc.", "Wait GPU NB local", "NB X/F buffer ops.",
"Vsite spread", "COM pull force",
"Write traj.", "Update", "Constraints", "Comm. energies",
"Enforced rotation", "Add rot. forces", "Position swapping", "IMD", "Test"
"NB F buffer ops.",
};
+/* PME GPU timing events' names - correspond to the enum in the gpu_timing.h */
+static const char *PMEStageNames[] =
+{
+ "Spline",
+ "Spread",
+ "Spline/spread",
+ "FFT r2c",
+ "Solve",
+ "FFT c2r",
+ "Gather",
+};
+
gmx_bool wallcycle_have_counter(void)
{
return gmx_cycles_have_counter();
void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int npme,
int nth_pp, int nth_pme, double realtime,
gmx_wallcycle_t wc, const WallcycleCounts &cyc_sum,
- struct gmx_wallclock_gpu_t *gpu_t)
+ const gmx_wallclock_gpu_nbnxn_t *gpu_nbnxn_t,
+ const gmx_wallclock_gpu_pme_t *gpu_pme_t)
{
- double tot, tot_for_pp, tot_for_rest, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, tot_k;
+ double tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio;
double c2t, c2t_pp, c2t_pme = 0;
int i, j, npp, nth_tot;
char buf[STRLEN];
}
/* print GPU timing summary */
- if (gpu_t)
+ double tot_gpu = 0.0;
+ if (gpu_pme_t)
+ {
+ for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
+ {
+ tot_gpu += gpu_pme_t->timing[k].t;
+ }
+ }
+ if (gpu_nbnxn_t)
{
const char *k_log_str[2][2] = {
{"Nonbonded F kernel", "Nonbonded F+ene k."},
{"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."}
};
-
- tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t;
+ tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t;
/* add up the kernel timings */
- tot_k = 0.0;
for (i = 0; i < 2; i++)
{
for (j = 0; j < 2; j++)
{
- tot_k += gpu_t->ktime[i][j].t;
+ tot_gpu += gpu_nbnxn_t->ktime[i][j].t;
}
}
- tot_gpu += tot_k + gpu_t->pruneTime.t;
+ tot_gpu += gpu_nbnxn_t->pruneTime.t;
tot_cpu_overlap = wc->wcc[ewcFORCE].c;
if (wc->wcc[ewcPMEMESH].n > 0)
fprintf(fplog, " Computing: Count Wall t (s) ms/step %c\n", '%');
fprintf(fplog, "%s\n", hline);
print_gputimes(fplog, "Pair list H2D",
- gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu);
+ gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
print_gputimes(fplog, "X / q H2D",
- gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu);
+ gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);
for (i = 0; i < 2; i++)
{
for (j = 0; j < 2; j++)
{
- if (gpu_t->ktime[i][j].c)
+ if (gpu_nbnxn_t->ktime[i][j].c)
{
print_gputimes(fplog, k_log_str[i][j],
- gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu);
+ gpu_nbnxn_t->ktime[i][j].c, gpu_nbnxn_t->ktime[i][j].t, tot_gpu);
}
}
}
- if (gpu_t->pruneTime.c)
+ if (gpu_pme_t)
{
- print_gputimes(fplog, "Pruning kernel", gpu_t->pruneTime.c, gpu_t->pruneTime.t, tot_gpu);
+ for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
+ {
+ if (gpu_pme_t->timing[k].c)
+ {
+ print_gputimes(fplog, PMEStageNames[k],
+ gpu_pme_t->timing[k].c,
+ gpu_pme_t->timing[k].t,
+ tot_gpu);
+ }
+ }
}
- print_gputimes(fplog, "F D2H", gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu);
+ if (gpu_nbnxn_t->pruneTime.c)
+ {
+ print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c, gpu_nbnxn_t->pruneTime.t, tot_gpu);
+ }
+ print_gputimes(fplog, "F D2H", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu);
fprintf(fplog, "%s\n", hline);
- print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu);
+ print_gputimes(fplog, "Total ", gpu_nbnxn_t->nb_c, tot_gpu, tot_gpu);
fprintf(fplog, "%s\n", hline);
- if (gpu_t->dynamicPruneTime.c)
+ if (gpu_nbnxn_t->dynamicPruneTime.c)
{
/* We print the dynamic pruning kernel timings after a separator
* and avoid adding it to tot_gpu as this is not in the force
* overlap. We print the fraction as relative to the rest.
*/
- print_gputimes(fplog, "*Dynamic pruning", gpu_t->dynamicPruneTime.c, gpu_t->dynamicPruneTime.t, tot_gpu);
+ print_gputimes(fplog, "*Dynamic pruning", gpu_nbnxn_t->dynamicPruneTime.c, gpu_nbnxn_t->dynamicPruneTime.t, tot_gpu);
fprintf(fplog, "%s\n", hline);
}
-
gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
- if (gpu_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
+ if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
{
+ // FIXME the code below is not updated for PME on GPU
fprintf(fplog, "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = %.3f\n",
- tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
+ tot_gpu/gpu_nbnxn_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
gpu_cpu_ratio);
}