pme_gpu_set_device_x(fr->pmedata, stateGpu->getCoordinates());
}
}
- stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::Local);
+ // We need to copy coordinates when:
+ // 1. Update is not offloaded
+ // 2. The buffers were reinitialized on search step
+ if (!simulationWork.useGpuUpdate || stepWork.doNeighborSearch)
+ {
+ stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::Local);
+ }
}
const auto localXReadyOnDevice = (stateGpu != nullptr) ? stateGpu->getCoordinatesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::Local,
// NS step is also a virial step (on which f buf ops are deactivated).
if (simulationWork.useGpuBufferOps && simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA))
{
- GMX_ASSERT(stateGpu, "stateGpu should be valid here");
+ GMX_ASSERT(stateGpu, "stateGpu should be valid when buffer ops are offloaded");
nbv->atomdata_init_add_nbat_f_to_f_gpu(stateGpu->fReducedOnDevice());
}
}
{
if (useGpuXBufOps == BufferOpsUseGpu::True)
{
+ GMX_ASSERT(stateGpu, "stateGpu should be valid when buffer ops are offloaded");
nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::Local, false,
stateGpu->getCoordinates(),
localXReadyOnDevice);
*/
void setPbc(const t_pbc *pbc);
- /*! \brief Blocking wait on the update of coordinates being ready.
- *
- * \todo Remove when the "stitching" is done.
- */
- void waitCoordinatesReadyOnDevice();
-
-
/*! \brief Return the synchronizer associated with the event indicating that the coordinates are ready on the device.
*/
GpuEventSynchronizer* getCoordinatesReadySync();
GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
}
-void UpdateConstrainCuda::waitCoordinatesReadyOnDevice()
-{
- GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
-}
-
GpuEventSynchronizer* UpdateConstrainCuda::getCoordinatesReadySync()
{
GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
settleCuda_->setPbc(pbc);
}
-void UpdateConstrainCuda::Impl::waitCoordinatesReadyOnDevice()
-{
- coordinatesReady_->waitForEvent();
-}
-
GpuEventSynchronizer* UpdateConstrainCuda::Impl::getCoordinatesReadySync()
{
return coordinatesReady_;
impl_->setPbc(pbc);
}
-void UpdateConstrainCuda::waitCoordinatesReadyOnDevice()
-{
- impl_->waitCoordinatesReadyOnDevice();
-}
-
GpuEventSynchronizer* UpdateConstrainCuda::getCoordinatesReadySync()
{
return impl_->getCoordinatesReadySync();
*/
void setPbc(const t_pbc *pbc);
- /*! \brief Blocking wait on the update of coordinates being ready.
- *
- * \todo Remove when the "stitching" is done.
- */
- void waitCoordinatesReadyOnDevice();
-
-
/*! \brief Return the synchronizer associated with the event indicating that the coordinates are ready on the device.
*/
GpuEventSynchronizer* getCoordinatesReadySync();
if (bPMETune && bNStList)
{
+ // This has to be here because PME load balancing is called so early.
+ // TODO: Move to after all booleans are defined.
+ if (useGpuForUpdate && !bFirstStep)
+ {
+ stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::Local);
+ stateGpu->waitCoordinatesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
+ }
/* PME grid + cut-off optimization with GPUs or PME nodes */
pme_loadbal_do(pme_loadbal, cr,
(mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
do_verbose = mdrunOptions.verbose &&
(step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
- // Copy velocities from the GPU when needed:
- // - On search steps to keep copy on host (device buffers are reinitialized).
- // - When needed for the output.
if (useGpuForUpdate && !bFirstStep)
{
+ // Copy velocities from the GPU when needed:
+ // - On search steps to keep copy on host (device buffers are reinitialized).
+ // - When needed for the output.
if (bNS || do_per_step(step, ir->nstvout))
{
stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
stateGpu->waitVelocitiesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
}
- }
+ // Copy coordinates from the GPU when needed:
+ // - On search steps to keep copy on host (device buffers are reinitialized).
+ // - When there are CPU bonded or free-energy forces that need current coordinates.
+ // - When needed for the output.
+ if (bNS ||
+ (runScheduleWork->domainWork.haveCpuBondedWork || runScheduleWork->domainWork.haveFreeEnergyWork) ||
+ do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed))
+ {
+ stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::Local);
+ stateGpu->waitCoordinatesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
+ }
+ }
if (bNS && !(bFirstStep && ir->bContinuation))
{
// Copy data to the GPU after buffers might have been reinitialized
stateGpu->copyVelocitiesToGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
+ stateGpu->copyCoordinatesToGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::Local);
}
- stateGpu->copyCoordinatesToGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::All);
stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), StatePropagatorDataGpu::AtomLocality::All);
// TODO: Use StepWorkload fields.
ir->delta_t, true, bCalcVir, shake_vir,
doTempCouple, ekind->tcstat,
doParrinelloRahman, ir->nstpcouple*ir->delta_t, M);
- stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::All);
// Copy velocities D2H after update if:
// - Globals are computed this step (includes the energy output steps).
{
stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
stateGpu->waitVelocitiesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
+ stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::Local);
+ stateGpu->waitCoordinatesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
}
- // TODO: replace with stateGpu->waitForCopyCoordinatesFromGpu(...)
- integrator->waitCoordinatesReadyOnDevice();
}
else
{
{
process_and_stopcm_grp(fplog, &vcm, *mdatoms, state->x.rvec_array(), state->v.rvec_array());
inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
+
+ // TODO: The special case of removing CM motion should be dealt with more gracefully
+ if (useGpuForUpdate)
+ {
+ stateGpu->copyCoordinatesToGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::Local);
+ stateGpu->waitCoordinatesCopiedToDevice(StatePropagatorDataGpu::AtomLocality::Local);
+ }
}
}
}
const SimulationWorkload &simulationWork,
const StepWorkload &stepWork);
+ /*! \brief Blocking wait until coordinates are copied to the device.
+ *
+ * Blocks until the coordinates-ready-on-device event for the given locality has been signalled.
+ *
+ * \param[in] atomLocality Locality of the particles to wait for.
+ */
+ void waitCoordinatesCopiedToDevice(AtomLocality atomLocality);
+
/*! \brief Getter for the event synchronizer signalling that the update is done on the GPU
*
* \returns The event to synchronize on, signalling that the coordinates were updated on the device.
return nullptr;
}
+void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality /* atomLocality */)
+{ // Stub definition: the locality argument is unnamed/unused and the body only fires the assertion below.
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
+}
+
GpuEventSynchronizer* StatePropagatorDataGpu::xUpdatedOnDevice()
{
GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
const SimulationWorkload &simulationWork,
const StepWorkload &stepWork);
+ /*! \brief Blocking wait until coordinates are copied to the device.
+ *
+ * Blocks until the coordinates-ready-on-device event for the given locality has been signalled.
+ *
+ * \param[in] atomLocality Locality of the particles to wait for.
+ */
+ void waitCoordinatesCopiedToDevice(AtomLocality atomLocality);
+
/*! \brief Getter for the event synchronizer signalling that the update is done on the GPU
*
* \returns The event to synchronize on, signalling that the coordinates were updated on the device.
}
}
+void StatePropagatorDataGpu::Impl::waitCoordinatesCopiedToDevice(AtomLocality atomLocality)
+{ // Waits on the xReadyOnDevice_ event that belongs to the requested locality.
+ GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality."); // reject an out-of-range locality before it is used as an index
+ xReadyOnDevice_[atomLocality].waitForEvent();
+}
+
GpuEventSynchronizer* StatePropagatorDataGpu::Impl::xUpdatedOnDevice()
{
return &xUpdatedOnDevice_;
return impl_->getCoordinatesReadyOnDeviceEvent(atomLocality, simulationWork, stepWork);
}
+void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality atomLocality)
+{
+ // Forward the wait to the implementation object; this wrapper is void, so nothing is returned.
+ impl_->waitCoordinatesCopiedToDevice(atomLocality);
+}
+
GpuEventSynchronizer* StatePropagatorDataGpu::xUpdatedOnDevice()
{
return impl_->xUpdatedOnDevice();