The event synchronizer that indicates the coordinates are ready in GPU
memory is now passed to the two tasks that depend on this input: PME and
X buffer ops. Both enqueue a wait on the passed event prior to kernel
launch, ensuring that the coordinates are ready before the kernels
start executing.
On separate PME ranks and in tests no synchronization is necessary,
as a single stream is used there.
With the on-device sync in place, this change also removes the
streamSynchronize call from copyCoordinatesToGpu.
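For reference, below is a minimal CUDA sketch of the dependency pattern
this change relies on (the streams, event and kernel names are
illustrative placeholders, not the GROMACS API; GpuEventSynchronizer's
markEvent/enqueueWaitEvent wrap this CUDA event machinery):

    #include <cuda_runtime.h>

    __global__ void spreadKernel(const float3 *x, int n)
    {
        // Consumes the coordinates, e.g. PME spread or X buffer ops.
    }

    void launchWithDependency(const float3 *h_x, float3 *d_x, int n,
                              cudaStream_t copyStream, cudaStream_t pmeStream)
    {
        cudaEvent_t xReadyOnDevice;
        cudaEventCreateWithFlags(&xReadyOnDevice, cudaEventDisableTiming);

        // Producer: asynchronous H2D copy of the coordinates, then mark the
        // event in the copy stream (analogous to markEvent() in
        // copyCoordinatesToGpu).
        cudaMemcpyAsync(d_x, h_x, n * sizeof(float3), cudaMemcpyHostToDevice, copyStream);
        cudaEventRecord(xReadyOnDevice, copyStream);

        // Consumer: enqueue a wait in the PME stream so the kernel starts only
        // after the copy has completed; no host-side stream synchronization
        // is needed.
        cudaStreamWaitEvent(pmeStream, xReadyOnDevice, 0);
        spreadKernel<<<(n + 127) / 128, 128, 0, pmeStream>>>(d_x, n);

        cudaEventDestroy(xReadyOnDevice);
    }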
Refs. #2816, #3126.
Change-Id: I3457f01f44ca6d6ad08e0118d8b1def2ab0b381b
14 files changed:
* Launches first stage of PME on GPU - spreading kernel.
*
* \param[in] pme The PME data structure.
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
* \param[in] wcycle The wallclock counter.
*/
-GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
- gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
+GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
+ GpuEventSynchronizer *GPU_FUNC_ARGUMENT(xReadyOnDevice),
+ gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
/*! \brief
* Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode.
}
void pme_gpu_launch_spread(gmx_pme_t *pme,
+ GpuEventSynchronizer *xReadyOnDevice,
gmx_wallcycle *wcycle)
{
GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
const bool spreadCharges = true;
wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
- pme_gpu_spread(pmeGpu, gridIndex, fftgrid, computeSplines, spreadCharges);
+ pme_gpu_spread(pmeGpu, xReadyOnDevice, gridIndex, fftgrid, computeSplines, spreadCharges);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
return std::pair<int, int>(colCount, minRowCount);
}
-void pme_gpu_spread(const PmeGpu *pmeGpu,
- int gmx_unused gridIndex,
- real *h_grid,
- bool computeSplines,
- bool spreadCharges)
+void pme_gpu_spread(const PmeGpu *pmeGpu,
+ GpuEventSynchronizer *xReadyOnDevice,
+ int gmx_unused gridIndex,
+ real *h_grid,
+ bool computeSplines,
+ bool spreadCharges)
{
GMX_ASSERT(computeSplines || spreadCharges, "PME spline/spread kernel has invalid input (nothing to do)");
const auto *kernelParamsPtr = pmeGpu->kernelParams.get();
//(for spline data mostly, together with varying PME_GPU_PARALLEL_SPLINE define)
GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock), "inconsistent atom data padding vs. spreading block size");
+ // Ensure that the coordinates are ready on the device before launching spread;
+ // this is only needed with CUDA on PP+PME ranks. It is not needed on separate
+ // PME ranks, in unit tests or in OpenCL, as those cases use a single stream
+ // (hence xReadyOnDevice == nullptr).
+ // Note: Consider adding an assertion on xReadyOnDevice once separate PME ranks
+ // can be detected here.
+ if (xReadyOnDevice)
+ {
+ xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream);
+ }
+
const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);
* A GPU spline computation and charge spreading function.
*
* \param[in] pmeGpu The PME GPU structure.
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory;
+ * can be nullptr when invoked on a separate PME rank or from PME tests.
* \param[in] gridIndex Index of the PME grid - unused, assumed to be 0.
* \param[out] h_grid The host-side grid buffer (used only if the result of the spread is expected on the host,
* e.g. testing or host-side FFT)
* \param[in] computeSplines Should the computation of spline parameters and gridline indices be performed.
* \param[in] spreadCharges Should the charges/coefficients be spread on the grid.
*/
-GPU_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
- int GPU_FUNC_ARGUMENT(gridIndex),
- real *GPU_FUNC_ARGUMENT(h_grid),
- bool GPU_FUNC_ARGUMENT(computeSplines),
- bool GPU_FUNC_ARGUMENT(spreadCharges)) GPU_FUNC_TERM;
+GPU_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
+ GpuEventSynchronizer *GPU_FUNC_ARGUMENT(xReadyOnDevice),
+ int GPU_FUNC_ARGUMENT(gridIndex),
+ real *GPU_FUNC_ARGUMENT(h_grid),
+ bool GPU_FUNC_ARGUMENT(computeSplines),
+ bool GPU_FUNC_ARGUMENT(spreadCharges)) GPU_FUNC_TERM;
/*! \libinternal \brief
* 3D FFT R2C/C2R routine.
// or maybe use inputrecDynamicBox(ir), at the very least - change this when this codepath is tested!
pme_gpu_prepare_computation(pme, boxChanged, box, wcycle, pmeFlags, useGpuPmeForceReduction);
stateGpu->copyCoordinatesToGpu(gmx::ArrayRef<gmx::RVec>(pme_pp->x), gmx::StatePropagatorDataGpu::AtomLocality::All);
+ // On the separate PME rank we do not need a synchronizer as we schedule everything in a single stream
+ GpuEventSynchronizer *xReadyOnDevice = nullptr;
- pme_gpu_launch_spread(pme, wcycle);
+ pme_gpu_launch_spread(pme, xReadyOnDevice, wcycle);
pme_gpu_launch_complex_transforms(pme, wcycle);
pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set);
output = pme_gpu_wait_finish_task(pme, pmeFlags, wcycle);
break;
case CodePath::GPU:
- pme_gpu_spread(pme->gpu, gridIndex, fftgrid, computeSplines, spreadCharges);
- break;
+ {
+ // no synchronization needed as x is transferred in the PME stream
+ GpuEventSynchronizer *xReadyOnDevice = nullptr;
+ pme_gpu_spread(pme->gpu, xReadyOnDevice, gridIndex, fftgrid, computeSplines, spreadCharges);
+ }
+ break;
default:
GMX_THROW(InternalError("Test not implemented for this mode"));
* \param[in] box The box matrix
* \param[in] stepWork Step schedule flags
* \param[in] pmeFlags PME flags
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
* \param[in] wcycle The wallcycle structure
*/
-static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
- const matrix box,
- const StepWorkload &stepWork,
- int pmeFlags,
- gmx_wallcycle_t wcycle)
+static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
+ const matrix box,
+ const StepWorkload &stepWork,
+ int pmeFlags,
+ GpuEventSynchronizer *xReadyOnDevice,
+ gmx_wallcycle_t wcycle)
{
pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags, stepWork.useGpuPmeFReduction);
- pme_gpu_launch_spread(pmedata, wcycle);
+ pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle);
}
/*! \brief Launch the FFT and gather stages of PME GPU
stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::Local);
}
+ const auto localXReadyOnDevice = (stateGpu != nullptr) ? stateGpu->getCoordinatesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::Local,
+ simulationWork, stepWork) : nullptr;
if (useGpuPmeOnThisRank)
{
- launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags, wcycle);
+ launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags,
+ localXReadyOnDevice, wcycle);
}
/* do gridding for pair search */
if (useGpuXBufOps == BufferOpsUseGpu::True)
{
nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::Local, false,
- stateGpu->getCoordinates());
+ stateGpu->getCoordinates(),
+ localXReadyOnDevice);
stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
}
nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::NonLocal, false,
- stateGpu->getCoordinates());
+ stateGpu->getCoordinates(),
+ stateGpu->getCoordinatesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::NonLocal,
+ simulationWork, stepWork));
paddingSize_(paddingSize)
{
static_assert(GMX_GPU != GMX_GPU_NONE, "This object should only be constructed on the GPU code-paths.");
- GMX_RELEASE_ASSERT(getenv("GMX_USE_GPU_BUFFER_OPS") == nullptr, "GPU buffer ops are not supported in this build.");
// TODO: Refactor when the StreamManager is introduced.
if (GMX_GPU == GMX_GPU_OPENCL)
paddingSize_(paddingSize)
{
static_assert(GMX_GPU != GMX_GPU_NONE, "This object should only be constructed on the GPU code-paths.");
- GMX_RELEASE_ASSERT(getenv("GMX_USE_GPU_BUFFER_OPS") == nullptr, "GPU buffer ops are not supported in this build.");
if (GMX_GPU == GMX_GPU_OPENCL)
{
if (GMX_GPU == GMX_GPU_CUDA)
{
xReadyOnDevice_[atomLocality].markEvent(commandStream);
- // TODO: Remove When event-based synchronization is introduced
- gpuStreamSynchronize(commandStream);
//
// TODO: This should be reconsidered to support the halo exchange.
//
+ // In OpenCL no events are used as coordinate sync is not necessary
+ if (GMX_GPU == GMX_GPU_OPENCL)
+ {
+ return nullptr;
+ }
if (atomLocality == AtomLocality::Local && simulationWork.useGpuUpdate && !stepWork.doNeighborSearch)
{
return &xUpdatedOnDevice_;
}
/* Copies (and reorders) the coordinates to nbnxn_atomdata_t on the GPU*/
-void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet,
- const Nbnxm::AtomLocality locality,
- bool fillLocal,
- gmx_nbnxn_gpu_t *gpu_nbv,
- DeviceBuffer<float> d_x)
+void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet,
+ const Nbnxm::AtomLocality locality,
+ bool fillLocal,
+ gmx_nbnxn_gpu_t *gpu_nbv,
+ DeviceBuffer<float> d_x,
+ GpuEventSynchronizer *xReadyOnDevice)
fillLocal && g == 0,
gpu_nbv,
d_x,
+ xReadyOnDevice,
locality,
g,
gridSet.numColumnsMax());
* \param[in] fillLocal Tells if the local filler particle coordinates should be zeroed.
* \param[in,out] gpu_nbv The NBNXM GPU data structure.
* \param[in] d_x Coordinates to be copied (in plain rvec format).
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
-void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet,
- Nbnxm::AtomLocality locality,
- bool fillLocal,
- gmx_nbnxn_gpu_t *gpu_nbv,
- DeviceBuffer<float> d_x);
+void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet,
+ Nbnxm::AtomLocality locality,
+ bool fillLocal,
+ gmx_nbnxn_gpu_t *gpu_nbv,
+ DeviceBuffer<float> d_x,
+ GpuEventSynchronizer *xReadyOnDevice);
/*! \brief Add the computed forces to \p f, an internal reduction might be performed as well
*
bool setFillerCoords,
gmx_nbnxn_gpu_t *nb,
DeviceBuffer<float> d_x,
+ GpuEventSynchronizer *xReadyOnDevice,
const Nbnxm::AtomLocality locality,
int gridId,
int numColumnsMax)
// TODO: This will only work with CUDA
GMX_ASSERT(d_x, "Need a valid device pointer");
+ // ensure that coordinates are ready on the device before launching the kernel
+ GMX_ASSERT(xReadyOnDevice, "Need a valid GpuEventSynchronizer object");
+ xReadyOnDevice->enqueueWaitEvent(stream);
+
KernelLaunchConfig config;
config.blockSize[0] = c_bufOpsThreadsPerBlock;
config.blockSize[1] = 1;
wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
}
-void nonbonded_verlet_t::convertCoordinatesGpu(const Nbnxm::AtomLocality locality,
- const bool fillLocal,
- DeviceBuffer<float> d_x)
+void nonbonded_verlet_t::convertCoordinatesGpu(const Nbnxm::AtomLocality locality,
+ const bool fillLocal,
+ DeviceBuffer<float> d_x,
+ GpuEventSynchronizer *xReadyOnDevice)
{
wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS);
nbnxn_atomdata_x_to_nbat_x_gpu(pairSearch_->gridSet(), locality, fillLocal,
gpu_nbv,
- d_x);
+ d_x,
+ xReadyOnDevice);
wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS);
wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS);
wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
*
* The API function for the transformation of the coordinates from one layout to another in the GPU memory.
*
- * \param[in] locality Whether coordinates for local or non-local atoms should be transformed.
- * \param[in] fillLocal If the coordinates for filler particles should be zeroed.
- * \param[in] d_x GPU coordinates buffer in plain rvec format to be transformed.
+ * \param[in] locality Whether coordinates for local or non-local atoms should be transformed.
+ * \param[in] fillLocal If the coordinates for filler particles should be zeroed.
+ * \param[in] d_x GPU coordinates buffer in plain rvec format to be transformed.
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
- void convertCoordinatesGpu(Nbnxm::AtomLocality locality,
- bool fillLocal,
- DeviceBuffer<float> d_x);
+ void convertCoordinatesGpu(Nbnxm::AtomLocality locality,
+ bool fillLocal,
+ DeviceBuffer<float> d_x,
+ GpuEventSynchronizer *xReadyOnDevice);
//! Init for GPU version of setup coordinates in Nbnxm
void atomdata_init_copy_x_to_nbat_x_gpu();
* \param[in] setFillerCoords If the filler coordinates are used.
* \param[in,out] gpu_nbv The nonbonded data GPU structure.
* \param[in] d_x Device-side coordinates in plain rvec format.
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
* \param[in] locality Copy coordinates for local or non-local atoms.
* \param[in] gridId Index of the grid being converted.
* \param[in] numColumnsMax Maximum number of columns in the grid.
*/
CUDA_FUNC_QUALIFIER
-void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused &grid,
- bool gmx_unused setFillerCoords,
- gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
- DeviceBuffer<float> gmx_unused d_x,
- Nbnxm::AtomLocality gmx_unused locality,
- int gmx_unused gridId,
- int gmx_unused numColumnsMax) CUDA_FUNC_TERM;
+void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused &grid,
+ bool gmx_unused setFillerCoords,
+ gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
+ DeviceBuffer<float> gmx_unused d_x,
+ GpuEventSynchronizer gmx_unused *xReadyOnDevice,
+ Nbnxm::AtomLocality gmx_unused locality,
+ int gmx_unused gridId,
+ int gmx_unused numColumnsMax) CUDA_FUNC_TERM;
/*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
* \param[in] nb The nonbonded data GPU structure