With GPU update, coordinates are transferred back to the CPU every step
if there are forces to compute on the CPU. Originally this was
implemented with a back-to-back transfer launch and wait at the
beginning of do_force().
This change moves the CPU wait for the completion of the coordinate
transfer closer to the consumer tasks, so that it no longer blocks the
launch of GPU force tasks and compute and transfer can overlap.
Fixes #3221
Change-Id: Ia6641147bbec1186b54c1445d36dc31000eae9c4
const bool haveHostPmePpComms =
!thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
const bool haveHostHaloExchangeComms = havePPDomainDecomposition(cr) && !ddUsesGpuDirectCommunication;
const bool haveHostPmePpComms =
!thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
const bool haveHostHaloExchangeComms = havePPDomainDecomposition(cr) && !ddUsesGpuDirectCommunication;
+
+ bool gmx_used_in_debug haveCopiedXFromGpu = false;
if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
&& (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
|| haveHostPmePpComms || haveHostHaloExchangeComms))
{
stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
&& (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
|| haveHostPmePpComms || haveHostHaloExchangeComms))
{
stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
- stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+ haveCopiedXFromGpu = true;
}
const auto localXReadyOnDevice = (stateGpu != nullptr)
}
const auto localXReadyOnDevice = (stateGpu != nullptr)
bool reinitGpuPmePpComms = simulationWork.useGpuPmePpCommunication && (stepWork.doNeighborSearch);
bool sendCoordinatesFromGpu =
simulationWork.useGpuPmePpCommunication && !(stepWork.doNeighborSearch);
bool reinitGpuPmePpComms = simulationWork.useGpuPmePpCommunication && (stepWork.doNeighborSearch);
bool sendCoordinatesFromGpu =
simulationWork.useGpuPmePpCommunication && !(stepWork.doNeighborSearch);
+
+ if (simulationWork.useGpuUpdate && !sendCoordinatesFromGpu)
+ {
+ GMX_RELEASE_ASSERT(false,
+ "GPU update and separate PME ranks are only supported with GPU "
+ "direct communication!");
+ // TODO: when this code-path becomes supported add:
+ // stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+ }
+
gmx_pme_send_coordinates(fr, cr, box, as_rvec_array(x.unpaddedArrayRef().data()), lambda[efptCOUL],
lambda[efptVDW], (stepWork.computeVirial || stepWork.computeEnergy),
step, simulationWork.useGpuPmePpCommunication, reinitGpuPmePpComms,
gmx_pme_send_coordinates(fr, cr, box, as_rvec_array(x.unpaddedArrayRef().data()), lambda[efptCOUL],
lambda[efptVDW], (stepWork.computeVirial || stepWork.computeEnergy),
step, simulationWork.useGpuPmePpCommunication, reinitGpuPmePpComms,
+ if (simulationWork.useGpuUpdate)
+ {
+ GMX_ASSERT(stateGpu, "need a valid stateGpu object");
+ GMX_ASSERT(haveCopiedXFromGpu,
+ "a wait should only be triggered if copy has been scheduled");
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+ }
nbv->convertCoordinates(AtomLocality::Local, false, x.unpaddedArrayRef());
}
}
nbv->convertCoordinates(AtomLocality::Local, false, x.unpaddedArrayRef());
}
}
+ // Note: GPU update + DD without direct communication is not supported;
+ // if it becomes supported, a waitCoordinatesReadyOnHost() must be issued here.
+ GMX_ASSERT(!simulationWork.useGpuUpdate,
+ "GPU update is not supported with halo exchange");
dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
}
dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
}
dd_force_flop_start(cr->dd, nrnb);
}
dd_force_flop_start(cr->dd, nrnb);
}
+ // For the rest of the CPU tasks that depend on GPU-update produced coordinates,
+ // this wait ensures that the D2H transfer is complete.
+ if ((simulationWork.useGpuUpdate)
+ && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial))
+ {
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+ }
+
if (inputrec->bRot)
{
wallcycle_start(wcycle, ewcROT);
if (inputrec->bRot)
{
wallcycle_start(wcycle, ewcROT);