When GPU update is used, we must wait for the coordinates to be copied to the
host before using them to calculate the dipole moment.
This did not appear to cause any issues with CUDA, but it caused a unit
test failure with SYCL (with !1329):
SYCL_BE=PI_OPENCL GMX_USE_GPU_BUFFER_OPS=1 GMX_FORCE_UPDATE_DEFAULT_GPU=1 ./bin/mdrun-test --gtest_filter=EwaldSurfaceTerm/EwaldSurfaceTermTest.WithinTolerances/0
Refs #3930, #3932
bool gmx_used_in_debug haveCopiedXFromGpu = false;
if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
&& (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
bool gmx_used_in_debug haveCopiedXFromGpu = false;
if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
&& (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
- || haveHostPmePpComms || haveHostHaloExchangeComms))
+ || haveHostPmePpComms || haveHostHaloExchangeComms || simulationWork.computeMuTot))
{
stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
haveCopiedXFromGpu = true;
{
stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
haveCopiedXFromGpu = true;
+ if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch)
+ {
+ GMX_ASSERT(haveCopiedXFromGpu,
+ "a wait should only be triggered if copy has been scheduled");
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+ }
+
/* Calculate total (local) dipole moment in a temporary common array.
* This makes it possible to sum them over nodes faster.
*/
/* Calculate total (local) dipole moment in a temporary common array.
* This makes it possible to sum them over nodes faster.
*/