#endif
}
-/*! \brief \libinternal
- * The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
- *
- * \param[in] pmeGpu The PME GPU structure.
- */
-void pme_gpu_reinit_computation(const PmeGpu *pmeGpu)
-{
- pme_gpu_clear_grids(pmeGpu);
- pme_gpu_clear_energy_virial(pmeGpu);
-}
-
/*! \brief \libinternal
* (Re-)initializes all the PME GPU data related to the grid size and cut-off.
*
pme_gpu_reinit_timings(pme->gpu);
pme_gpu_reinit_grids(pme->gpu);
- pme_gpu_reinit_computation(pme->gpu);
+    // Note: if timing the reinit launch overhead becomes more relevant
+    // (e.g. with regular PP-PME re-balancing), we should pass wcycle here.
+ pme_gpu_reinit_computation(pme, nullptr);
/* Clear the previous box - doesn't hurt, and forces the PME CPU recipbox
* update for mixed mode on grid switch. TODO: use shared recipbox field.
*/
PmeGpu *pmeGpu = pme->gpu;
- // The only spot of PME GPU where LAUNCH_GPU (sub)counter increases call-count
+ // The only spot of PME GPU where LAUNCH_GPU counter increases call-count
wallcycle_start(wcycle, ewcLAUNCH_GPU);
+ // The only spot of PME GPU where ewcsLAUNCH_GPU_PME subcounter increases call-count
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_PME);
pme_gpu_copy_input_coordinates(pmeGpu, x);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
{
const auto gridOrdering = pme_gpu_uses_dd(pmeGpu) ? GridOrdering::YZX : GridOrdering::XYZ;
wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
- wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); //FIXME nocount
+ wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
pme_gpu_solve(pmeGpu, cfftgrid, gridOrdering, computeEnergyAndVirial);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
// Time the final staged data handling separately with a counting call to get
// the call count right.
wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER);
-
- // The computation has completed, do timing accounting and resetting buffers
pme_gpu_update_timings(pme->gpu);
- // TODO: move this later and launch it together with the other
- // non-bonded tasks at the end of the step
- pme_gpu_reinit_computation(pme->gpu);
-
pme_gpu_get_staged_results(pme, forces, virial, energy);
-
wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
return true;
{
pme_gpu_try_finish_task(pme, wcycle, forces, virial, energy, GpuTaskCompletion::Wait);
}
+
+void pme_gpu_reinit_computation(const gmx_pme_t *pme,
+ gmx_wallcycle *wcycle)
+{
+ GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
+
+ wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
+ wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
+
+ pme_gpu_clear_grids(pme->gpu);
+ pme_gpu_clear_energy_virial(pme->gpu);
+
+ wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+}
real *CUDA_FUNC_ARGUMENT(energy),
GpuTaskCompletion CUDA_FUNC_ARGUMENT(completionKind)) CUDA_FUNC_TERM_WITH_RETURN(false)
+/*! \brief
+ * The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
+ *
+ * Clears the internal grid and energy/virial buffers; it is not safe to start
+ * the PME computation without calling this.
+ * Note that unlike in the nbnxn module, the force buffer does not need clearing.
+ *
+ * \todo Rename this function to *clear* -- it only does output resetting,
+ * and the name should state clearly what the function does.
+ *
+ * \param[in] pme The PME data structure.
+ * \param[in] wcycle The wallclock counter.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_reinit_computation(const gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
+ gmx_wallcycle *CUDA_FUNC_ARGUMENT(wcycle)) CUDA_FUNC_TERM
#endif
wallcycle_stop(wcycle, ewcFORCE);
}
+ if (useGpuPme)
+ {
+ pme_gpu_reinit_computation(fr->pmedata, wcycle);
+ }
+
if (bUseGPU)
{
/* now clear the GPU outputs while we finish the step on the CPU */
}
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
-
- // TODO: move here the PME buffer clearing call pme_gpu_reinit_computation()
}
/* Do the nonbonded GPU (or emulation) force buffer reduction