The H2D copies are only needed:
1. When update is not offloaded.
2. At the search steps, after device buffers were reinitialized.
The D2H copies are only needed:
1. On the search steps, since the device buffers are reinitialized.
2. If there are CPU consumers, e.g. CPU bonded forces.
3. When the energy is computed.
4. When coordinates are needed for output.
There are two special cases, when coordinates are needed on host,
that are dealt with separately:
1. When the PME is tuned.
2. When center of mass motion is removed.
The locality of copied atoms when update is offloaded is changed
from All to Local in preparation for the multi-GPU case. The blocking sync
on H2D copy event is moved from UpdateConstraints to
StatePropagatorDataGpu.
Change-Id: I971a6273b39fa7da07600312c085ce343b5d25ee
pme_gpu_set_device_x(fr->pmedata, stateGpu->getCoordinates());
}
}
pme_gpu_set_device_x(fr->pmedata, stateGpu->getCoordinates());
}
}
- stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::Local);
+ // We need to copy coordinates when:
+ // 1. Update is not offloaded
+ // 2. The buffers were reinitialized on search step
+ if (!simulationWork.useGpuUpdate || stepWork.doNeighborSearch)
+ {
+ stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::Local);
+ }
}
const auto localXReadyOnDevice = (stateGpu != nullptr) ? stateGpu->getCoordinatesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::Local,
}
const auto localXReadyOnDevice = (stateGpu != nullptr) ? stateGpu->getCoordinatesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::Local,
// NS step is also a virial step (on which f buf ops are deactivated).
if (simulationWork.useGpuBufferOps && simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA))
{
// NS step is also a virial step (on which f buf ops are deactivated).
if (simulationWork.useGpuBufferOps && simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA))
{
- GMX_ASSERT(stateGpu, "stateGpu should be valid here");
+ GMX_ASSERT(stateGpu, "stateGpu should be valid when buffer ops are offloaded");
nbv->atomdata_init_add_nbat_f_to_f_gpu(stateGpu->fReducedOnDevice());
}
}
nbv->atomdata_init_add_nbat_f_to_f_gpu(stateGpu->fReducedOnDevice());
}
}
{
if (useGpuXBufOps == BufferOpsUseGpu::True)
{
{
if (useGpuXBufOps == BufferOpsUseGpu::True)
{
+ GMX_ASSERT(stateGpu, "stateGpu should be valid when buffer ops are offloaded");
nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::Local, false,
stateGpu->getCoordinates(),
localXReadyOnDevice);
nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::Local, false,
stateGpu->getCoordinates(),
localXReadyOnDevice);
*/
void setPbc(const t_pbc *pbc);
*/
void setPbc(const t_pbc *pbc);
- /*! \brief Blocking wait on the update of coordinates being ready.
- *
- * \todo Remove when the "stitching" is done.
- */
- void waitCoordinatesReadyOnDevice();
-
-
/*! \brief Return the synchronizer associated with the event indicated that the coordinates are ready on the device.
*/
GpuEventSynchronizer* getCoordinatesReadySync();
/*! \brief Return the synchronizer associated with the event indicated that the coordinates are ready on the device.
*/
GpuEventSynchronizer* getCoordinatesReadySync();
GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
}
GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
}
-void UpdateConstrainCuda::waitCoordinatesReadyOnDevice()
-{
- GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
-}
-
GpuEventSynchronizer* UpdateConstrainCuda::getCoordinatesReadySync()
{
GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
GpuEventSynchronizer* UpdateConstrainCuda::getCoordinatesReadySync()
{
GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
settleCuda_->setPbc(pbc);
}
settleCuda_->setPbc(pbc);
}
-void UpdateConstrainCuda::Impl::waitCoordinatesReadyOnDevice()
-{
- coordinatesReady_->waitForEvent();
-}
-
GpuEventSynchronizer* UpdateConstrainCuda::Impl::getCoordinatesReadySync()
{
return coordinatesReady_;
GpuEventSynchronizer* UpdateConstrainCuda::Impl::getCoordinatesReadySync()
{
return coordinatesReady_;
-void UpdateConstrainCuda::waitCoordinatesReadyOnDevice()
-{
- impl_->waitCoordinatesReadyOnDevice();
-}
-
GpuEventSynchronizer* UpdateConstrainCuda::getCoordinatesReadySync()
{
return impl_->getCoordinatesReadySync();
GpuEventSynchronizer* UpdateConstrainCuda::getCoordinatesReadySync()
{
return impl_->getCoordinatesReadySync();
*/
void setPbc(const t_pbc *pbc);
*/
void setPbc(const t_pbc *pbc);
- /*! \brief Blocking wait on the update of coordinates being ready.
- *
- * \todo Remove when the "stitching" is done.
- */
- void waitCoordinatesReadyOnDevice();
-
-
/*! \brief Return the synchronizer associated with the event indicated that the coordinates are ready on the device.
*/
GpuEventSynchronizer* getCoordinatesReadySync();
/*! \brief Return the synchronizer associated with the event indicated that the coordinates are ready on the device.
*/
GpuEventSynchronizer* getCoordinatesReadySync();
if (bPMETune && bNStList)
{
if (bPMETune && bNStList)
{
+ // This has to be here because PME load balancing is called so early.
+ // TODO: Move to after all booleans are defined.
+ if (useGpuForUpdate && !bFirstStep)
+ {
+ stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::Local);
+ stateGpu->waitCoordinatesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
+ }
/* PME grid + cut-off optimization with GPUs or PME nodes */
pme_loadbal_do(pme_loadbal, cr,
(mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
/* PME grid + cut-off optimization with GPUs or PME nodes */
pme_loadbal_do(pme_loadbal, cr,
(mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
do_verbose = mdrunOptions.verbose &&
(step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
do_verbose = mdrunOptions.verbose &&
(step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
- // Copy velocities from the GPU when needed:
- // - On search steps to keep copy on host (device buffers are reinitialized).
- // - When needed for the output.
if (useGpuForUpdate && !bFirstStep)
{
if (useGpuForUpdate && !bFirstStep)
{
+ // Copy velocities from the GPU when needed:
+ // - On search steps to keep copy on host (device buffers are reinitialized).
+ // - When needed for the output.
if (bNS || do_per_step(step, ir->nstvout))
{
stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
stateGpu->waitVelocitiesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
}
if (bNS || do_per_step(step, ir->nstvout))
{
stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
stateGpu->waitVelocitiesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
}
+ // Copy coordinates from the GPU when needed:
+ // - On search steps to keep copy on host (device buffers are reinitialized).
+ // - There are CPU bonded forces that need current coordinates
+ // - When needed for the output.
+ if (bNS ||
+ (runScheduleWork->domainWork.haveCpuBondedWork || runScheduleWork->domainWork.haveFreeEnergyWork) ||
+ do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed))
+ {
+ stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::Local);
+ stateGpu->waitCoordinatesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
+ }
+ }
if (bNS && !(bFirstStep && ir->bContinuation))
{
if (bNS && !(bFirstStep && ir->bContinuation))
{
// Copy data to the GPU after buffers might have being reinitialized
stateGpu->copyVelocitiesToGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
// Copy data to the GPU after buffers might have being reinitialized
stateGpu->copyVelocitiesToGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
+ stateGpu->copyCoordinatesToGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::Local);
- stateGpu->copyCoordinatesToGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::All);
stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), StatePropagatorDataGpu::AtomLocality::All);
// TODO: Use StepWorkload fields.
stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), StatePropagatorDataGpu::AtomLocality::All);
// TODO: Use StepWorkload fields.
ir->delta_t, true, bCalcVir, shake_vir,
doTempCouple, ekind->tcstat,
doParrinelloRahman, ir->nstpcouple*ir->delta_t, M);
ir->delta_t, true, bCalcVir, shake_vir,
doTempCouple, ekind->tcstat,
doParrinelloRahman, ir->nstpcouple*ir->delta_t, M);
- stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::All);
// Copy velocities D2H after update if:
// - Globals are computed this step (includes the energy output steps).
// Copy velocities D2H after update if:
// - Globals are computed this step (includes the energy output steps).
{
stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
stateGpu->waitVelocitiesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
{
stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
stateGpu->waitVelocitiesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
+ stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::Local);
+ stateGpu->waitCoordinatesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
- // TODO: replace with stateGpu->waitForCopyCoordinatesFromGpu(...)
- integrator->waitCoordinatesReadyOnDevice();
{
process_and_stopcm_grp(fplog, &vcm, *mdatoms, state->x.rvec_array(), state->v.rvec_array());
inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
{
process_and_stopcm_grp(fplog, &vcm, *mdatoms, state->x.rvec_array(), state->v.rvec_array());
inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
+
+ // TODO: The special case of removing CM motion should be dealt with more gracefully
+ if (useGpuForUpdate)
+ {
+ stateGpu->copyCoordinatesToGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::Local);
+ stateGpu->waitCoordinatesCopiedToDevice(StatePropagatorDataGpu::AtomLocality::Local);
+ }
const SimulationWorkload &simulationWork,
const StepWorkload &stepWork);
const SimulationWorkload &simulationWork,
const StepWorkload &stepWork);
+ /*! \brief Blocking wait until coordinates are copied to the device.
+ *
+ * Synchronizes the stream in which the copy was executed.
+ *
+ * \param[in] atomLocality Locality of the particles to wait for.
+ */
+ void waitCoordinatesCopiedToDevice(AtomLocality atomLocality);
+
/*! \brief Getter for the event synchronizer for the update is done on th GPU
*
* \returns The event to synchronize the stream coordinates wre updated on device.
/*! \brief Getter for the event synchronizer for the update is done on th GPU
*
* \returns The event to synchronize the stream coordinates wre updated on device.
+void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality /* atomLocality */)
+{
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
+}
+
GpuEventSynchronizer* StatePropagatorDataGpu::xUpdatedOnDevice()
{
GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
GpuEventSynchronizer* StatePropagatorDataGpu::xUpdatedOnDevice()
{
GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
const SimulationWorkload &simulationWork,
const StepWorkload &stepWork);
const SimulationWorkload &simulationWork,
const StepWorkload &stepWork);
+ /*! \brief Blocking wait until coordinates are copied to the device.
+ *
+ * Synchronizes the stream in which the copy was executed.
+ *
+ * \param[in] atomLocality Locality of the particles to wait for.
+ */
+ void waitCoordinatesCopiedToDevice(AtomLocality atomLocality);
+
/*! \brief Getter for the event synchronizer for the update is done on th GPU
*
* \returns The event to synchronize the stream coordinates wre updated on device.
/*! \brief Getter for the event synchronizer for the update is done on th GPU
*
* \returns The event to synchronize the stream coordinates wre updated on device.
+void StatePropagatorDataGpu::Impl::waitCoordinatesCopiedToDevice(AtomLocality atomLocality)
+{
+ GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+ xReadyOnDevice_[atomLocality].waitForEvent();
+}
+
GpuEventSynchronizer* StatePropagatorDataGpu::Impl::xUpdatedOnDevice()
{
return &xUpdatedOnDevice_;
GpuEventSynchronizer* StatePropagatorDataGpu::Impl::xUpdatedOnDevice()
{
return &xUpdatedOnDevice_;
return impl_->getCoordinatesReadyOnDeviceEvent(atomLocality, simulationWork, stepWork);
}
return impl_->getCoordinatesReadyOnDeviceEvent(atomLocality, simulationWork, stepWork);
}
+void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality atomLocality)
+{
+ return impl_->waitCoordinatesCopiedToDevice(atomLocality);
+}
+
GpuEventSynchronizer* StatePropagatorDataGpu::xUpdatedOnDevice()
{
return impl_->xUpdatedOnDevice();
GpuEventSynchronizer* StatePropagatorDataGpu::xUpdatedOnDevice()
{
return impl_->xUpdatedOnDevice();