- Explicit consumption of events when needed.
- In some cases, reset the event in case the conditionals
for the consumption are too complex.
- Added helper functions to StatePropagatorDataGpu.
- A special handling for force reduction when it has no atoms.
Refs #3988, #4106, #4227.
baseForce_,
cellInfo_.d_cell,
deviceStream_);
-
- // Mark that kernel has been launched
- if (completionMarker_ != nullptr)
+ }
+ else
+ {
+ /* In case we have nothing to do, but still have dependencies, we need
+ * to consume them and mark our own event.
+ * Happens sometimes in MdrunVsitesTest.
+ * Issue #3988, #4227. */
+ for (auto* synchronizer : dependencyList_)
{
- completionMarker_->markEvent(deviceStream_);
+ synchronizer->consume();
}
}
+ /* Mark that kernel has been launched.
+ * Even if we have no work to do and have not launched the kernel, we still mark the event
+ * in order to ensure proper marking/consumption balance, see Issue #3988, #4227. */
+ if (completionMarker_ != nullptr)
+ {
+ completionMarker_->markEvent(deviceStream_);
+ }
+
wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuNBFBufOps);
wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
{
GMX_ASSERT(stateGpu != nullptr, "stateGpu should not be null");
stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::Local);
+ if (stepWork.doNeighborSearch)
+ {
+ /* On NS steps, we skip X buffer ops. So, unless we use PME or direct GPU
+ * communications, we don't wait for the coordinates on the device,
+ * and we must consume the event here.
+ * Issue #3988. */
+ const bool eventWillBeConsumedByGpuPme = stepWork.haveGpuPmeOnThisRank;
+ const bool eventWillBeConsumedByGpuPmePPComm =
+ (simulationWork.haveSeparatePmeRank && stepWork.computeSlowForces)
+ && pmeSendCoordinatesFromGpu;
+ if (!eventWillBeConsumedByGpuPme && !eventWillBeConsumedByGpuPmePPComm)
+ {
+ stateGpu->consumeCoordinatesCopiedToDeviceEvent(AtomLocality::Local);
+ }
+ }
}
}
if (!stepWork.useGpuFHalo)
{
+ /* We don't explicitly wait for the forces to be reduced on device,
+ * but wait for them to finish copying to CPU instead.
+ * So, we manually consume the event, see Issue #3988. */
+ stateGpu->consumeForcesReducedOnDeviceEvent(AtomLocality::NonLocal);
// copy from GPU input for dd_move_f()
stateGpu->copyForcesFromGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
AtomLocality::NonLocal);
|| (simulationWork.useGpuUpdate && haveDDAtomOrdering(*cr) && simulationWork.useCpuPmePpCommunication)
|| vsite)
{
+ if (stepWork.computeNonbondedForces)
+ {
+ /* We have previously issued force reduction on the GPU, but we will
+ * not use this event, instead relying on the stream being in-order.
+ * Issue #3988. */
+ stateGpu->consumeForcesReducedOnDeviceEvent(AtomLocality::Local);
+ }
stateGpu->copyForcesFromGpu(forceWithShift, AtomLocality::Local);
stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
}
// TODO: The special case of removing CM motion should be dealt more gracefully
if (useGpuForUpdate)
{
+ // Issue #3988, #4106.
+ stateGpu->resetCoordinatesCopiedToDeviceEvent(AtomLocality::Local);
stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
// Here we block until the H2D copy completes because event sync with the
// force kernels that use the coordinates on the next steps is not implemented
*/
void waitCoordinatesCopiedToDevice(AtomLocality atomLocality);
+ /*! \brief Consume the event for copying coordinates to the device.
+ *
+ * Used for manual event consumption. Does nothing except changing the internal event counter.
+ *
+ * \param[in] atomLocality Locality of the particles.
+ */
+ void consumeCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality);
+
+ /*! \brief Reset the event for copying coordinates to the device.
+ *
+ * Used for manual event consumption. Does nothing except resetting the event.
+ *
+ * \param[in] atomLocality Locality of the particles.
+ */
+ void resetCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality);
+
/*! \brief Setter for the event synchronizer for the update is done on th GPU
*
* \param[in] xUpdatedOnDeviceEvent The event to synchronize the stream coordinates wre updated on device.
*/
GpuEventSynchronizer* fReducedOnDevice(AtomLocality atomLocality);
+ /*! \brief Consume the event for when the forces are reduced on the GPU.
+ *
+ * \param[in] atomLocality Locality of the particles to wait for.
+ */
+ void consumeForcesReducedOnDeviceEvent(AtomLocality atomLocality);
+
/*! \brief Getter for the event synchronizer for the forces are ready on the GPU.
*
* \param[in] atomLocality Locality of the particles to wait for.
"GPU implementation.");
}
+void StatePropagatorDataGpu::consumeCoordinatesCopiedToDeviceEvent(AtomLocality /* atomLocality */)
+{
+ GMX_ASSERT(!impl_,
+ "A CPU stub method from GPU state propagator data was called instead of one from "
+ "GPU implementation.");
+}
+
+void StatePropagatorDataGpu::resetCoordinatesCopiedToDeviceEvent(AtomLocality /* atomLocality */)
+{
+ GMX_ASSERT(!impl_,
+ "A CPU stub method from GPU state propagator data was called instead of one from "
+ "GPU implementation.");
+}
+
void StatePropagatorDataGpu::setXUpdatedOnDeviceEvent(GpuEventSynchronizer* /* xUpdatedOnDeviceEvent */)
{
GMX_ASSERT(!impl_,
return nullptr;
}
+void StatePropagatorDataGpu::consumeForcesReducedOnDeviceEvent(AtomLocality /*atomLocality*/)
+{
+ GMX_ASSERT(!impl_,
+ "A CPU stub method from GPU state propagator data was called instead of one from "
+ "GPU implementation.");
+}
+
GpuEventSynchronizer* StatePropagatorDataGpu::fReadyOnDevice(AtomLocality /*atomLocality*/)
{
GMX_ASSERT(!impl_,
*/
void waitCoordinatesCopiedToDevice(AtomLocality atomLocality);
+ /*! \brief Consume the event for copying coordinates to the device.
+ *
+ * Used for manual event consumption. Does nothing except changing the internal event counter.
+ *
+ * \param[in] atomLocality Locality of the particles.
+ */
+ void consumeCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality);
+
+ /*! \brief Reset the event for copying coordinates to the device.
+ *
+ * Used for manual event consumption. Does nothing except resetting the event.
+ *
+ * \param[in] atomLocality Locality of the particles.
+ */
+ void resetCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality);
+
/*! \brief Setter for the event synchronizer for the update is done on th GPU
*
* \param[in] xUpdatedOnDeviceEvent The event to synchronize the stream coordinates wre updated on device.
*/
GpuEventSynchronizer* fReducedOnDevice(AtomLocality atomLocality);
+ //! \brief Consume the event for when the forces are reduced on device.
+ void consumeForcesReducedOnDeviceEvent(AtomLocality atomLocality);
+
/*! \brief Getter for the event synchronizer for the forces are ready for GPU update.
*
* \param[in] atomLocality Locality of the particles to wait for.
}
else
{
+ if (stepWork.doNeighborSearch && xUpdatedOnDeviceEvent_)
+ {
+ /* On search steps, we do not consume the result of the GPU update
+ * but rather that of a H2D transfer. So, we reset the event triggered after
+ * update to avoid leaving it unconsumed.
+ * Unfortunately, we don't always have the event marked either (e.g., on the
+ * first step) so we just reset it here.
+ * See Issue #3988. */
+ xUpdatedOnDeviceEvent_->reset();
+ }
return &xReadyOnDevice_[atomLocality];
}
}
wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
}
+void StatePropagatorDataGpu::Impl::consumeCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality)
+{
+ GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+ xReadyOnDevice_[atomLocality].consume();
+}
+
+void StatePropagatorDataGpu::Impl::resetCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality)
+{
+ GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+ xReadyOnDevice_[atomLocality].reset();
+}
+
void StatePropagatorDataGpu::Impl::setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpdatedOnDeviceEvent)
{
GMX_ASSERT(xUpdatedOnDeviceEvent != nullptr, "The event synchronizer can not be nullptr.");
return &fReducedOnDevice_[atomLocality];
}
+void StatePropagatorDataGpu::Impl::consumeForcesReducedOnDeviceEvent(AtomLocality atomLocality)
+{
+ fReducedOnDevice_[atomLocality].consume();
+}
+
GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReadyOnDevice(AtomLocality atomLocality)
{
return &fReadyOnDevice_[atomLocality];
return impl_->waitCoordinatesCopiedToDevice(atomLocality);
}
+void StatePropagatorDataGpu::consumeCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality)
+{
+ return impl_->consumeCoordinatesCopiedToDeviceEvent(atomLocality);
+}
+
+void StatePropagatorDataGpu::resetCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality)
+{
+ return impl_->resetCoordinatesCopiedToDeviceEvent(atomLocality);
+}
+
void StatePropagatorDataGpu::setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpdatedOnDeviceEvent)
{
impl_->setXUpdatedOnDeviceEvent(xUpdatedOnDeviceEvent);
return impl_->fReducedOnDevice(atomLocality);
}
+void StatePropagatorDataGpu::consumeForcesReducedOnDeviceEvent(AtomLocality atomLocality)
+{
+ impl_->consumeForcesReducedOnDeviceEvent(atomLocality);
+}
+
GpuEventSynchronizer* StatePropagatorDataGpu::fReadyOnDevice(AtomLocality atomLocality)
{
return impl_->fReadyOnDevice(atomLocality);