void pme_gpu_get_energy_virial(const PmeGpu *pmeGPU, real *energy, matrix virial)
{
+ for (int j = 0; j < c_virialAndEnergyCount; j++)
+ {
+ GMX_ASSERT(std::isfinite(pmeGPU->staging.h_virialAndEnergy[j]), "PME GPU produces incorrect energy/virial.");
+ }
+
GMX_ASSERT(energy, "Invalid energy output pointer in PME GPU");
unsigned int j = 0;
virial[XX][XX] = 0.25f * pmeGPU->staging.h_virialAndEnergy[j++];
pme_gpu_clear_energy_virial(pmeGPU);
}
-void pme_gpu_finish_step(const PmeGpu *pmeGPU, const bool bCalcF, const bool bCalcEnerVir)
+void pme_gpu_finish_step(const PmeGpu *pmeGPU)
{
- if (bCalcF && pme_gpu_performs_gather(pmeGPU))
- {
- pme_gpu_sync_output_forces(pmeGPU);
- }
- if (bCalcEnerVir && pme_gpu_performs_solve(pmeGPU))
- {
- pme_gpu_sync_output_energy_virial(pmeGPU);
- }
+ // Synchronize the whole PME stream in one call (this also covers the D2H result transfers), replacing the separate per-output waits.
+ // If forces were computed, they will have arrived in the external host buffer that was provided to the gather.
+ // If virial/energy were computed, they will have arrived in the internal staging buffer
+ // (although that transfer should already have completed before the gather was even launched).
+ // Finally, the cudaEvent_t-based GPU timers need the stream to be synchronized for correctness, so this must precede the timing update below.
+ pme_gpu_synchronize(pmeGPU);
+
pme_gpu_update_timings(pmeGPU);
pme_gpu_reinit_step(pmeGPU);
}
CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_forces(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU),
float *CUDA_FUNC_ARGUMENT(h_forces)) CUDA_FUNC_TERM
-/*! \libinternal \brief
- * Waits for the PME GPU output forces copying to the CPU buffer to finish.
- *
- * \param[in] pmeGPU The PME GPU structure.
- */
-CUDA_FUNC_QUALIFIER void pme_gpu_sync_output_forces(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
-
/*! \libinternal \brief
* Reallocates the input coordinates buffer on the GPU (and clears the padded part if needed).
*
*/
CUDA_FUNC_QUALIFIER void pme_gpu_free_fract_shifts(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
-/*! \libinternal \brief
- * Waits for the output virial/energy copying to the intermediate CPU buffer to finish.
- *
- * \param[in] pmeGPU The PME GPU structure.
- */
-CUDA_FUNC_QUALIFIER void pme_gpu_sync_output_energy_virial(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
-
/*! \libinternal \brief
* Copies the input real-space grid from the host to the GPU.
*
* Finishes the PME GPU step, waiting for the output forces and/or energy/virial to be copied to the host.
*
* \param[in] pmeGPU The PME GPU structure.
- * \param[in] bCalcForces The left-over flag from the CPU code which tells the function to copy the forces to the CPU side. Should be passed to the launch call instead. FIXME
- * \param[in] bCalcEnerVir The left-over flag from the CPU code which tells the function to copy the energy/virial to the CPU side. Should be passed to the launch call instead.
*/
-void pme_gpu_finish_step(const PmeGpu *pmeGPU, const bool bCalcForces,
- const bool bCalcEnerVir);
+void pme_gpu_finish_step(const PmeGpu *pmeGPU);
//! A binary enum for spline data layout transformation
enum class PmeLayoutTransform
GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
const bool haveComputedEnergyAndVirial = pme->gpu->settings.stepFlags & GMX_PME_CALC_ENER_VIR;
- const bool haveComputedForces = pme->gpu->settings.stepFlags & GMX_PME_CALC_F;
wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER);
- pme_gpu_finish_step(pme->gpu, haveComputedForces, haveComputedEnergyAndVirial);
+ pme_gpu_finish_step(pme->gpu);
wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
if (haveComputedEnergyAndVirial)
{
cu_copy_D2H_async(pmeGpu->staging.h_virialAndEnergy, kernelParamsPtr->constants.d_virialAndEnergy,
c_virialAndEnergyCount * sizeof(float), stream);
- cudaError_t stat = cudaEventRecord(pmeGpu->archSpecific->syncEnerVirD2H, stream);
- CU_RET_ERR(stat, "PME solve energy/virial event record failure");
}
if (copyInputAndOutputGrid)
{
cu_copy_D2H_async(h_grid, kernelParamsPtr->grid.d_fourierGrid, pmeGpu->archSpecific->complexGridSize * sizeof(float), stream);
- cudaError_t stat = cudaEventRecord(pmeGpu->archSpecific->syncSolveGridD2H, stream);
- CU_RET_ERR(stat, "PME solve grid sync event record failure");
}
}
{
if (pme_gpu_timings_enabled(pmeGPU))
{
- pme_gpu_synchronize(pmeGPU);
-
for (const size_t &activeTimer : pmeGPU->archSpecific->activeTimers)
{
pmeGPU->archSpecific->timingEvents[activeTimer].getLastRangeTime();
const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float);
GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
cu_copy_D2H_async(h_forces, pmeGPU->kernelParams->atoms.d_forces, forcesSize, pmeGPU->archSpecific->pmeStream);
- cudaError_t stat = cudaEventRecord(pmeGPU->archSpecific->syncForcesD2H, pmeGPU->archSpecific->pmeStream);
- CU_RET_ERR(stat, "PME gather forces synchronization failure");
-}
-
-void pme_gpu_sync_output_forces(const PmeGpu *pmeGPU)
-{
- cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncForcesD2H);
- CU_RET_ERR(stat, "Error while waiting for the PME GPU forces");
}
void pme_gpu_realloc_coordinates(const PmeGpu *pmeGPU)
pmeGPU->deviceInfo);
}
-void pme_gpu_sync_output_energy_virial(const PmeGpu *pmeGPU)
-{
- cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncEnerVirD2H);
- CU_RET_ERR(stat, "Error while waiting for PME solve output");
-
- for (int j = 0; j < c_virialAndEnergyCount; j++)
- {
- GMX_ASSERT(std::isfinite(pmeGPU->staging.h_virialAndEnergy[j]), "PME GPU produces incorrect energy/virial.");
- }
-}
-
void pme_gpu_copy_input_gather_grid(const PmeGpu *pmeGpu, float *h_grid)
{
const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
cu_copy_D2H_async(pmeGpu->staging.h_theta, kernelParamsPtr->atoms.d_theta, splinesSize, pmeGpu->archSpecific->pmeStream);
cu_copy_D2H_async(pmeGpu->staging.h_gridlineIndices, kernelParamsPtr->atoms.d_gridlineIndices,
kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->archSpecific->pmeStream);
- cudaError_t stat = cudaEventRecord(pmeGpu->archSpecific->syncSplineAtomDataD2H, pmeGpu->archSpecific->pmeStream);
- CU_RET_ERR(stat, "PME spread atom data sync event record failure");
}
void pme_gpu_copy_input_gather_atom_data(const PmeGpu *pmeGpu)
CU_RET_ERR(stat, "Error while waiting for the PME GPU spread grid to be copied to the host");
}
-void pme_gpu_sync_spline_atom_data(const PmeGpu *pmeGPU)
-{
- cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncSplineAtomDataD2H);
- CU_RET_ERR(stat, "Error while waiting for the PME GPU atom data to be copied to the host");
-}
-
-void pme_gpu_sync_solve_grid(const PmeGpu *pmeGPU)
-{
- cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncSolveGridD2H);
- CU_RET_ERR(stat, "Error while waiting for the PME GPU solve grid to be copied to the host");
- //should check for pme_gpu_performs_solve(pmeGPU)
-}
-
void pme_gpu_init_internal(PmeGpu *pmeGPU)
{
/* Allocate the target-specific structures */
void pme_gpu_init_sync_events(const PmeGpu *pmeGPU)
{
cudaError_t stat;
const auto eventFlags = cudaEventDisableTiming;
- stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncEnerVirD2H, eventFlags);
- CU_RET_ERR(stat, "cudaEventCreate on syncEnerVirD2H failed");
- stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncForcesD2H, eventFlags);
- CU_RET_ERR(stat, "cudaEventCreate on syncForcesD2H failed");
+ // Only the spread-grid D2H event is still created here; the other per-output events are removed by this change.
+ // The CUDA call is deliberately kept out of the CU_RET_ERR argument list: CU_RET_ERR is a macro,
+ // so a call written inside it would run as many times as the macro expands its argument.
+ // NOTE(review): confirm against the CU_RET_ERR definition; storing the status first is safe either way.
stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSpreadGridD2H, eventFlags);
CU_RET_ERR(stat, "cudaEventCreate on syncSpreadGridD2H failed");
- stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSplineAtomDataD2H, eventFlags);
- CU_RET_ERR(stat, "cudaEventCreate on syncSplineAtomDataD2H failed");
- stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSolveGridD2H, eventFlags);
- CU_RET_ERR(stat, "cudaEventCreate on syncSolveGridD2H failed");
}
void pme_gpu_destroy_sync_events(const PmeGpu *pmeGPU)
{
cudaError_t stat;
- stat = cudaEventDestroy(pmeGPU->archSpecific->syncEnerVirD2H);
- CU_RET_ERR(stat, "cudaEventDestroy failed on syncEnerVirD2H");
- stat = cudaEventDestroy(pmeGPU->archSpecific->syncForcesD2H);
- CU_RET_ERR(stat, "cudaEventDestroy failed on syncForcesD2H");
+ // Only the spread-grid D2H event is still destroyed here; the other per-output events are removed by this change.
+ // The CUDA call is deliberately kept out of the CU_RET_ERR argument list: CU_RET_ERR is a macro,
+ // so a call written inside it would run as many times as the macro expands its argument
+ // (a repeated cudaEventDestroy would act on an already destroyed event).
+ // NOTE(review): confirm against the CU_RET_ERR definition; storing the status first is safe either way.
stat = cudaEventDestroy(pmeGPU->archSpecific->syncSpreadGridD2H);
CU_RET_ERR(stat, "cudaEventDestroy failed on syncSpreadGridD2H");
- stat = cudaEventDestroy(pmeGPU->archSpecific->syncSplineAtomDataD2H);
- CU_RET_ERR(stat, "cudaEventDestroy failed on syncSplineAtomDataD2H");
- stat = cudaEventDestroy(pmeGPU->archSpecific->syncSolveGridD2H);
- CU_RET_ERR(stat, "cudaEventDestroy failed on syncSolveGridD2H");
}
void pme_gpu_reinit_3dfft(const PmeGpu *pmeGPU)
cudaStream_t pmeStream;
/* Synchronization events */
- /*! \brief Triggered after the energy/virial have been copied to the host (after the solving stage). */
- cudaEvent_t syncEnerVirD2H;
- /*! \brief Triggered after the output forces have been copied to the host (after the gathering stage). */
- cudaEvent_t syncForcesD2H;
/*! \brief Triggered after the grid has been copied to the host (after the spreading stage). */
cudaEvent_t syncSpreadGridD2H;
- /*! \brief Triggered after the atom spline data has been copied to the host (after the spline computation). */
- cudaEvent_t syncSplineAtomDataD2H;
- /*! \brief Triggered after the grid hes been copied to the host (after the solving stage) */
- cudaEvent_t syncSolveGridD2H;
// TODO: consider moving some things below into the non-CUDA struct.