launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply X Halo Exchange", kernelArgs);
}
- communicateHaloData(d_x_, HaloQuantity::HaloCoordinates);
+ communicateHaloData(d_x_, HaloQuantity::HaloCoordinates, coordinatesReadyOnDeviceEvent);
return;
}
{
// Communicate halo data (in non-local stream)
- communicateHaloData(d_f_, HaloQuantity::HaloForces);
+ communicateHaloData(d_f_, HaloQuantity::HaloForces, nullptr);
float3* d_f = d_f_;
}
-void GpuHaloExchange::Impl::communicateHaloData(float3* d_ptr, HaloQuantity haloQuantity)
+void GpuHaloExchange::Impl::communicateHaloData(float3* d_ptr,
+ HaloQuantity haloQuantity,
+ GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
{
void* sendPtr;
recvRank = recvRankX_;
#if GMX_MPI
- // Wait for signal from receiving task that it is ready, and similarly send signal to task that will push data to this task
- char thisTaskIsReady, remoteTaskIsReady;
- MPI_Sendrecv(&thisTaskIsReady, sizeof(char), MPI_BYTE, recvRank, 0, &remoteTaskIsReady,
- sizeof(char), MPI_BYTE, sendRank, 0, mpi_comm_mysim_, MPI_STATUS_IGNORE);
+ // Wait for event from receiving task that remote coordinates are ready, and enqueue that event to stream used
+ // for subsequent data push. This avoids a race condition with the remote data being written in the previous timestep.
+ // Similarly send event to task that will push data to this task.
+ GpuEventSynchronizer* remoteCoordinatesReadyOnDeviceEvent;
+ MPI_Sendrecv(&coordinatesReadyOnDeviceEvent, sizeof(GpuEventSynchronizer*), MPI_BYTE,
+ recvRank, 0, &remoteCoordinatesReadyOnDeviceEvent, sizeof(GpuEventSynchronizer*),
+ MPI_BYTE, sendRank, 0, mpi_comm_mysim_, MPI_STATUS_IGNORE);
+ remoteCoordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
#endif
}
else
/*! \brief Data transfer wrapper for GPU halo exchange
* \param [inout] d_ptr pointer to coordinates or force buffer in GPU memory
* \param [in] haloQuantity switch on whether X or F halo exchange is being performed
+ * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates have been copied to device
*/
- void communicateHaloData(float3* d_ptr, HaloQuantity haloQuantity);
+ void communicateHaloData(float3* d_ptr,
+ HaloQuantity haloQuantity,
+ GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
/*! \brief Data transfer for GPU halo exchange using CUDA memcopies
* \param [inout] sendPtr address to send data from