// Clearing of the forces can be done in local stream since the nonlocal stream cannot reach
// the force accumulation stage before syncing with the local stream. Only done in CUDA and
// SYCL, since the force buffer ops are not implemented in OpenCL.
// Clearing of the forces can be done in local stream since the nonlocal stream cannot reach
// the force accumulation stage before syncing with the local stream. Only done in CUDA and
// SYCL, since the force buffer ops are not implemented in OpenCL.
{
// Enqueue an asynchronous clear of the force buffer d_f_ on the local stream.
// NOTE(review): the arguments look like (buffer, starting offset 0, element count
// d_fCapacity_), i.e. the full allocated capacity is cleared rather than only the
// in-use part -- confirm against the clearDeviceBufferAsync declaration.
clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, *localStream_);
}
{
// Enqueue an asynchronous clear of the force buffer d_f_ on the local stream.
// NOTE(review): the arguments look like (buffer, starting offset 0, element count
// d_fCapacity_), i.e. the full allocated capacity is cleared rather than only the
// in-use part -- confirm against the clearDeviceBufferAsync declaration.
clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, *localStream_);
}
// - it's not needed, copy is done in the same stream as the only consumer task (PME)
// - we don't consume the events in OpenCL which is not allowed by GpuEventSynchronizer (would leak memory).
// TODO: remove this by adding an event-mark-free flavor of this function
// - it's not needed, copy is done in the same stream as the only consumer task (PME)
// - we don't consume the events in OpenCL which is not allowed by GpuEventSynchronizer (would leak memory).
// TODO: remove this by adding an event-mark-free flavor of this function