Instead of handling the OpenCL host to device copy as a special case
where this operation is issued in the PME stream, as the "Update" stream
is anyway created unconditionally it makes sense to simplify codepaths
at the negligible cost of an extra GPU-side synchronization.
This allows unifying the coordinate handling in stateGpu as well as in
PME spread (with combined PP+PME) which removes some complexity and makes
it easier to reason about event consumption, reducing the need for balancing
workarounds.
The change also introduces a data member in StatePropagatorDataGpu to
indicate whether it is used on a separate PME rank, in which case marking
of the coordinate H2D event can be skipped as no event dependency is
required.
Refs #3988
gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu)
{
GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu)
{
GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
- GMX_ASSERT(!GMX_GPU_CUDA || xReadyOnDevice || !pme->bPPnode,
- "Need a valid xReadyOnDevice on PP+PME ranks with CUDA.");
+ GMX_ASSERT(xReadyOnDevice || !pme->bPPnode, "Need a valid xReadyOnDevice on PP+PME ranks.");
GMX_ASSERT(pme->doCoulomb, "Only Coulomb PME can be run on GPU.");
PmeGpu* pmeGpu = pme->gpu;
GMX_ASSERT(pme->doCoulomb, "Only Coulomb PME can be run on GPU.");
PmeGpu* pmeGpu = pme->gpu;
"inconsistent atom data padding vs. spreading block size");
// Ensure that coordinates are ready on the device before launching spread;
"inconsistent atom data padding vs. spreading block size");
// Ensure that coordinates are ready on the device before launching spread;
- // only needed with CUDA on PP+PME ranks, not on separate PME ranks, in unit tests
- // nor in OpenCL as these cases use a single stream (hence xReadyOnDevice == nullptr).
- GMX_ASSERT(!GMX_GPU_CUDA || xReadyOnDevice != nullptr || pmeGpu->common->isRankPmeOnly
+ // only needed on PP+PME ranks, not on separate PME ranks, in unit tests
+ // as these cases use a single stream (hence xReadyOnDevice == nullptr).
+ GMX_ASSERT(xReadyOnDevice != nullptr || pmeGpu->common->isRankPmeOnly
|| pme_gpu_settings(pmeGpu).copyAllOutputs,
"Need a valid coordinate synchronizer on PP+PME ranks with CUDA.");
|| pme_gpu_settings(pmeGpu).copyAllOutputs,
"Need a valid coordinate synchronizer on PP+PME ranks with CUDA.");
//! \brief Pointer to wallcycle structure.
gmx_wallcycle* wcycle_;
//! \brief Pointer to wallcycle structure.
gmx_wallcycle* wcycle_;
+ //! Whether this instance of the class is used on a PME-only rank
+ bool isPmeOnly_ = false;
+
/*! \brief Performs the copy of data from host to device buffer.
*
* \todo Template on locality.
/*! \brief Performs the copy of data from host to device buffer.
*
* \todo Template on locality.
pmeStream_ = &deviceStreamManager.stream(DeviceStreamType::Pme);
localStream_ = &deviceStreamManager.stream(DeviceStreamType::NonBondedLocal);
nonLocalStream_ = &deviceStreamManager.stream(DeviceStreamType::NonBondedNonLocal);
pmeStream_ = &deviceStreamManager.stream(DeviceStreamType::Pme);
localStream_ = &deviceStreamManager.stream(DeviceStreamType::NonBondedLocal);
nonLocalStream_ = &deviceStreamManager.stream(DeviceStreamType::NonBondedNonLocal);
- // PME stream is used in OpenCL for H2D coordinate transfer
- updateStream_ = &deviceStreamManager.stream(
- GMX_GPU_OPENCL ? DeviceStreamType::Pme : DeviceStreamType::UpdateAndConstraints);
+ updateStream_ = &deviceStreamManager.stream(DeviceStreamType::UpdateAndConstraints);
// Map the atom locality to the stream that will be used for coordinates,
// velocities and forces transfers. Same streams are used for H2D and D2H copies.
// Map the atom locality to the stream that will be used for coordinates,
// velocities and forces transfers. Same streams are used for H2D and D2H copies.
nonLocalStream_ = nullptr;
updateStream_ = nullptr;
nonLocalStream_ = nullptr;
updateStream_ = nullptr;
// Only local/all coordinates are allowed to be copied in PME-only rank/ PME tests.
// This is a temporary measure to make it safe to use this class in those cases.
// Only local/all coordinates are allowed to be copied in PME-only rank/ PME tests.
// This is a temporary measure to make it safe to use this class in those cases.
copyToDevice(d_x_, h_x, d_xSize_, atomLocality, *deviceStream);
copyToDevice(d_x_, h_x, d_xSize_, atomLocality, *deviceStream);
- // markEvent is skipped in OpenCL as:
- // - it's not needed, copy is done in the same stream as the only consumer task (PME)
- // - we don't consume the events in OpenCL which is not allowed by GpuEventSynchronizer (would leak memory).
- // TODO: remove this by adding an event-mark free flavor of this function
- if (bool(GMX_GPU_CUDA) || bool(GMX_GPU_SYCL))
+ // marking is skipped in PME-only mode as everything is on the same stream
+ if (!isPmeOnly_)
{
xReadyOnDevice_[atomLocality].markEvent(*deviceStream);
}
{
xReadyOnDevice_[atomLocality].markEvent(*deviceStream);
}
// and this is not a neighbor search step, then the consumer needs to wait for the update
// to complete. Otherwise, the coordinates are copied from the host and we need to wait for
// the copy event. Non-local coordinates are provided by the GPU halo exchange (if active), otherwise by H2D copy.
// and this is not a neighbor search step, then the consumer needs to wait for the update
// to complete. Otherwise, the coordinates are copied from the host and we need to wait for
// the copy event. Non-local coordinates are provided by the GPU halo exchange (if active), otherwise by H2D copy.
- //
- // In OpenCL no events are used as coordinate sync is not necessary
- if (GMX_GPU_OPENCL)
- {
- return nullptr;
- }
if (atomLocality == AtomLocality::NonLocal && stepWork.useGpuXHalo)
{
GMX_ASSERT(gpuCoordinateHaloLaunched != nullptr,
if (atomLocality == AtomLocality::NonLocal && stepWork.useGpuXHalo)
{
GMX_ASSERT(gpuCoordinateHaloLaunched != nullptr,