{
GMX_ASSERT(stateGpu != nullptr, "stateGpu should not be null");
stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::Local);
+ if (stepWork.doNeighborSearch)
+ {
+ /* On NS steps, we skip X buffer ops. So, unless the event is going to be
+ * consumed by GPU PME on this rank or by PME-PP coordinate sending from the GPU,
+ * nothing waits for the local coordinates on the device, and we must consume
+ * the coordinates-copied-to-device event here. Issue #3988. */
+ const bool eventWillBeConsumedByGpuPme = stepWork.haveGpuPmeOnThisRank;
+ const bool eventWillBeConsumedByGpuPmePPComm =
+ (simulationWork.haveSeparatePmeRank && stepWork.computeSlowForces)
+ && pmeSendCoordinatesFromGpu;
+ if (!eventWillBeConsumedByGpuPme && !eventWillBeConsumedByGpuPmePPComm)
+ {
+ stateGpu->consumeCoordinatesCopiedToDeviceEvent(AtomLocality::Local);
+ }
+ }
}
}
if (!stepWork.useGpuFHalo)
{
+ /* We don't explicitly wait for the forces to be reduced on device,
+ * but wait for them to finish copying to CPU instead.
+ * So, we manually consume the event, see Issue #3988. */
+ stateGpu->consumeForcesReducedOnDeviceEvent(AtomLocality::NonLocal);
// copy from GPU input for dd_move_f()
stateGpu->copyForcesFromGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
AtomLocality::NonLocal);
|| (simulationWork.useGpuUpdate && haveDDAtomOrdering(*cr) && simulationWork.useCpuPmePpCommunication)
|| vsite)
{
+ if (stepWork.computeNonbondedForces)
+ {
+ /* We have previously issued force reduction on the GPU, but we will
+ * not wait on its forces-reduced event, instead relying on the stream
+ * being in-order, so we consume the event manually. Issue #3988. */
+ stateGpu->consumeForcesReducedOnDeviceEvent(AtomLocality::Local);
+ }
stateGpu->copyForcesFromGpu(forceWithShift, AtomLocality::Local);
stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
}