changePinningPolicy(&pme_pp->x, pme_get_pinning_policy());
}
- // Unconditionally initialize the StatePropagatorDataGpu object to get more verbose message if it is used from CPU builds
- auto stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(commandStream, deviceContext, GpuApiCallBehavior::Sync, paddingSize);
+ std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
+ if (useGpuForPme)
+ {
+ // TODO: The local and non-local nonbonded streams are passed as nullptrs, since they will not be used for GPU buffer
+ // management on PME-only ranks. Make the constructor safer.
+ stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(commandStream, nullptr, nullptr,
+ deviceContext, GpuApiCallBehavior::Sync, paddingSize);
+ }
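+ // A sketch of the "safer constructor" from the TODO above (illustrative only, not
+ // part of this change): the constructor could check up front that a null nbnxm
+ // stream can never be dereferenced later, e.g.
+ //   GMX_ASSERT(localStream != nullptr || !willCopyLocalRanges,
+ //              "A local nbnxm stream is required for local copies.");
+ // The willCopyLocalRanges flag is hypothetical.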
+
clear_nrnb(mynrnb);
(inputForceTreatment == PmeForceOutputHandling::ReduceWithInput) ? "with reduction" : "without reduction"
));
- PmeSafePointer pmeSafe = pmeInitWrapper(&inputRec, codePath, context->getDeviceInfo(), context->getPmeGpuProgram(), box);
- StatePropagatorDataGpu stateGpu = makeStatePropagatorDataGpu(*pmeSafe.get());
- pmeInitAtoms(pmeSafe.get(), &stateGpu, codePath, inputAtomData.coordinates, inputAtomData.charges);
+ PmeSafePointer pmeSafe = pmeInitWrapper(&inputRec, codePath, context->getDeviceInfo(), context->getPmeGpuProgram(), box);
+ std::unique_ptr<StatePropagatorDataGpu> stateGpu = (codePath == CodePath::GPU) ? makeStatePropagatorDataGpu(*pmeSafe.get()) : nullptr;
+
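+ // stateGpu is nullptr on the CPU code path; pmeInitAtoms is expected to dereference
+ // it only when codePath == CodePath::GPU.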
+ pmeInitAtoms(pmeSafe.get(), stateGpu.get(), codePath, inputAtomData.coordinates, inputAtomData.charges);
/* Setting some more inputs */
pmeSetRealGrid(pmeSafe.get(), codePath, nonZeroGridValues);
for (const auto &context : getPmeTestEnv()->getHardwareContexts())
{
- std::shared_ptr<StatePropagatorDataGpu> stateGpu;
CodePath codePath = context->getCodePath();
const bool supportedInput = pmeSupportsInputForMode(*getPmeTestEnv()->hwinfo(), &inputRec, codePath);
if (!supportedInput)
/* Running the test */
- PmeSafePointer pmeSafe = pmeInitWrapper(&inputRec, codePath, context->getDeviceInfo(), context->getPmeGpuProgram(), box);
- StatePropagatorDataGpu stateGpu = makeStatePropagatorDataGpu(*pmeSafe.get());
- pmeInitAtoms(pmeSafe.get(), &stateGpu, codePath, coordinates, charges);
+ PmeSafePointer pmeSafe = pmeInitWrapper(&inputRec, codePath, context->getDeviceInfo(), context->getPmeGpuProgram(), box);
+ std::unique_ptr<StatePropagatorDataGpu> stateGpu = (codePath == CodePath::GPU) ? makeStatePropagatorDataGpu(*pmeSafe.get()) : nullptr;
+
+ pmeInitAtoms(pmeSafe.get(), stateGpu.get(), codePath, coordinates, charges);
const bool computeSplines = (option.first == PmeSplineAndSpreadOptions::SplineOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
const bool spreadCharges = (option.first == PmeSplineAndSpreadOptions::SpreadOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
}
//! Make a GPU state-propagator manager
-StatePropagatorDataGpu
+std::unique_ptr<StatePropagatorDataGpu>
makeStatePropagatorDataGpu(const gmx_pme_t &pme)
{
// TODO: Pin the host buffer and use async memory copies
- return StatePropagatorDataGpu(pme_gpu_get_device_stream(&pme),
- pme_gpu_get_device_context(&pme),
- GpuApiCallBehavior::Sync,
- pme_gpu_get_padding_size(&pme));
+ return std::make_unique<StatePropagatorDataGpu>(pme_gpu_get_device_stream(&pme), nullptr, nullptr,
+ pme_gpu_get_device_context(&pme),
+ GpuApiCallBehavior::Sync,
+ pme_gpu_get_padding_size(&pme));
}
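+// Usage (as in the tests in this change): hold the result in a std::unique_ptr and
+// pass a possibly-null raw pointer onwards, e.g.
+//   std::unique_ptr<StatePropagatorDataGpu> stateGpu =
+//       (codePath == CodePath::GPU) ? makeStatePropagatorDataGpu(*pmeSafe.get()) : nullptr;
+//   pmeInitAtoms(pmeSafe.get(), stateGpu.get(), codePath, coordinates, charges);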
//! PME initialization with atom data
const Matrix3x3 &box = {{1.0F, 0.0F, 0.0F, 0.0F, 1.0F, 0.0F, 0.0F, 0.0F, 1.0F}},
real ewaldCoeff_q = 0.0F, real ewaldCoeff_lj = 0.0F);
//! Make a GPU state-propagator manager
-StatePropagatorDataGpu
+std::unique_ptr<StatePropagatorDataGpu>
makeStatePropagatorDataGpu(const gmx_pme_t &pme);
//! PME initialization with atom data and system box
void pmeInitAtoms(gmx_pme_t *pme,
*/
void setPbc(const t_pbc *pbc);
+ /*! \brief Synchronize the device stream.
+ */
+ void synchronizeStream();
private:
class Impl;
GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
}
+void UpdateConstrainCuda::synchronizeStream()
+{
+ GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
+}
+
} // namespace gmx
#endif /* GMX_GPU != GMX_GPU_CUDA */
settleCuda_->setPbc(pbc);
}
+void UpdateConstrainCuda::Impl::synchronizeStream()
+{
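+ // Blocks the host thread until all work enqueued in commandStream_ has completed.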
+ gpuStreamSynchronize(commandStream_);
+}
+
UpdateConstrainCuda::UpdateConstrainCuda(const t_inputrec &ir,
const gmx_mtop_t &mtop,
const void *commandStream)
impl_->setPbc(pbc);
}
+void UpdateConstrainCuda::synchronizeStream()
+{
+ impl_->synchronizeStream();
+}
+
} //namespace gmx
*/
void setPbc(const t_pbc *pbc);
+ /*! \brief Synchronize the device stream.
+ */
+ void synchronizeStream();
+
private:
//! CUDA stream
GMX_LOG(mdlog.info).asParagraph().
appendText("Updating coordinates on the GPU.");
}
- integrator = std::make_unique<UpdateConstrainCuda>(*ir, *top_global, nullptr);
+ integrator = std::make_unique<UpdateConstrainCuda>(*ir, *top_global, fr->stateGpu->getUpdateStream());
}
if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
doPressureCouple, ir->nstpcouple*ir->delta_t, M);
stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::All);
stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::All);
- stateGpu->synchronizeStream();
+ // Synchronize the update stream.
+ // TODO: Replace with event-based synchronization.
+ integrator->synchronizeStream();
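+ // Event-based sketch for the TODO above (illustrative; assumes a GpuEventSynchronizer
+ // member, here named xUpdatedOnDevice_, owned by the update module):
+ //   xUpdatedOnDevice_.markEvent(updateStream_); // recorded after the update kernel
+ //   xUpdatedOnDevice_.waitForEvent();           // host waits only on that event
+ // This avoids blocking the host on everything else enqueued in the stream.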
}
else
{
fcd->orires.nr != 0,
fcd->disres.nsystems != 0);
- const void *commandStream = ((GMX_GPU == GMX_GPU_OPENCL) && thisRankHasPmeGpuTask) ? pme_gpu_get_device_stream(fr->pmedata) : nullptr;
- const void *deviceContext = ((GMX_GPU == GMX_GPU_OPENCL) && thisRankHasPmeGpuTask) ? pme_gpu_get_device_context(fr->pmedata) : nullptr;
- const int paddingSize = pme_gpu_get_padding_size(fr->pmedata);
-
- const bool inputIsCompatibleWithModularSimulator = ModularSimulator::isInputCompatible(
+ const bool inputIsCompatibleWithModularSimulator = ModularSimulator::isInputCompatible(
false,
inputrec, doRerun, vsite.get(), ms, replExParams,
fcd, static_cast<int>(filenames.size()), filenames.data(),
&observablesHistory, membed);
- const bool useModularSimulator = inputIsCompatibleWithModularSimulator && !(getenv("GMX_DISABLE_MODULAR_SIMULATOR") != nullptr);
- GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator) ? GpuApiCallBehavior::Async : GpuApiCallBehavior::Sync;
+ const bool useModularSimulator = inputIsCompatibleWithModularSimulator && (getenv("GMX_DISABLE_MODULAR_SIMULATOR") == nullptr);
- // We initialize GPU state even for the CPU runs so we will have a more verbose
- // error if someone will try accessing it from the CPU codepath
- gmx::StatePropagatorDataGpu stateGpu(commandStream,
- deviceContext,
- transferKind,
- paddingSize);
- fr->stateGpu = &stateGpu;
+ std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
+ if (gpusWereDetected && ((useGpuForPme && thisRankHasDuty(cr, DUTY_PME)) || useGpuForUpdate))
+ {
+ const void *pmeStream = pme_gpu_get_device_stream(fr->pmedata);
+ const void *localStream = fr->nbv->gpu_nbv != nullptr ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::Local) : nullptr;
+ const void *nonLocalStream = fr->nbv->gpu_nbv != nullptr ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal) : nullptr;
+ const void *deviceContext = pme_gpu_get_device_context(fr->pmedata);
+ const int paddingSize = pme_gpu_get_padding_size(fr->pmedata);
+ GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator) ? GpuApiCallBehavior::Async : GpuApiCallBehavior::Sync;
+
+ stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(pmeStream,
+ localStream,
+ nonLocalStream,
+ deviceContext,
+ transferKind,
+ paddingSize);
+ fr->stateGpu = stateGpu.get();
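+ // fr->stateGpu is a non-owning alias; the stateGpu object above owns the data and
+ // must outlive every use of fr->stateGpu.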
+ }
// TODO This is not the right place to manage the lifetime of
// this data structure, but currently it's the easiest way to
* \todo A DeviceContext object is visible in CPU parts of the code so we
* can stop passing a void*.
*
- * \param[in] commandStream GPU stream, nullptr allowed.
- * \param[in] deviceContext GPU context, nullptr allowed.
- * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
- * \param[in] paddingSize Padding size for coordinates buffer.
- */
- StatePropagatorDataGpu(const void *commandStream,
+ * \param[in] pmeStream Device PME stream, nullptr allowed.
+ * \param[in] localStream Device NBNXM local stream, nullptr allowed.
+ * \param[in] nonLocalStream Device NBNXM non-local stream, nullptr allowed.
+ * \param[in] deviceContext Device context, nullptr allowed.
+ * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
+ * \param[in] paddingSize Padding size for coordinates buffer.
+ */
+ StatePropagatorDataGpu(const void *pmeStream,
+ const void *localStream,
+ const void *nonLocalStream,
const void *deviceContext,
GpuApiCallBehavior transferKind,
int paddingSize);
*/
void copyForcesFromGpu(gmx::ArrayRef<gmx::RVec> h_f,
AtomLocality atomLocality);
- /*! \brief Synchronize the underlying GPU stream
+
+ /*! \brief Getter for the update stream.
+ *
+ * \todo This is temporary here, until the management of this stream is taken over.
+ *
+ * \returns The device command stream to use in update-constraints.
*/
- void synchronizeStream();
+ void* getUpdateStream();
/*! \brief Getter for the number of local atoms.
*
{
};
-StatePropagatorDataGpu::StatePropagatorDataGpu(const void * /* commandStream */,
- const void * /* deviceContext */,
- GpuApiCallBehavior /* transferKind */,
- int /* paddingSize */)
+StatePropagatorDataGpu::StatePropagatorDataGpu(const void * /* pmeStream */,
+ const void * /* localStream */,
+ const void * /* nonLocalStream */,
+ const void * /* deviceContext */,
+ GpuApiCallBehavior /* transferKind */,
+ int /* paddingSize */)
: impl_(nullptr)
{
}
GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
}
-void StatePropagatorDataGpu::synchronizeStream()
+void* StatePropagatorDataGpu::getUpdateStream()
{
GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
+ return nullptr;
}
int StatePropagatorDataGpu::numAtomsLocal()
* \todo A DeviceContext object is visible in CPU parts of the code so we
* can stop passing a void*.
*
- * \param[in] commandStream GPU stream, nullptr allowed.
- * \param[in] deviceContext GPU context, nullptr allowed.
- * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
- * \param[in] paddingSize Padding size for coordinates buffer.
+ * \param[in] pmeStream Device PME stream, nullptr allowed.
+ * \param[in] localStream Device NBNXM local stream, nullptr allowed.
+ * \param[in] nonLocalStream Device NBNXM non-local stream, nullptr allowed.
+ * \param[in] deviceContext Device context, nullptr allowed.
+ * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
+ * \param[in] paddingSize Padding size for coordinates buffer.
*/
- Impl(const void *commandStream,
+ Impl(const void *pmeStream,
+ const void *localStream,
+ const void *nonLocalStream,
const void *deviceContext,
GpuApiCallBehavior transferKind,
int paddingSize);
* The coordinates buffer is reallocated with the padding added at the end. The
* size of padding is set by the constructor.
*
+ * \note PME requires the padding to be cleared, and this clearing is done in the pmeStream_.
+ *       Hence, the pmeStream_ should be created in the gpuContext_.
+ *
* \param[in] numAtomsLocal Number of atoms in local domain.
* \param[in] numAtomsAll Total number of atoms to handle.
*/
void copyForcesFromGpu(gmx::ArrayRef<gmx::RVec> h_f,
AtomLocality atomLocality);
- /*! \brief Synchronize the underlying GPU stream
+ /*! \brief Getter for the update stream.
+ *
+ * \todo This is temporary here, until the management of this stream is taken over.
+ *
+ * \returns The device command stream to use in update-constraints.
*/
- void synchronizeStream();
+ void* getUpdateStream();
/*! \brief Getter for the number of local atoms.
*
private:
- /*! \brief GPU stream.
- * \todo The stream should be set to non-nullptr once the synchronization points are restored
- */
- CommandStream commandStream_ = nullptr;
+ //! GPU PME stream.
+ CommandStream pmeStream_ = nullptr;
+ //! GPU NBNXM local stream.
+ CommandStream localStream_ = nullptr;
+ //! GPU NBNXM non-local stream.
+ CommandStream nonLocalStream_ = nullptr;
+ //! GPU update-constraints stream.
+ CommandStream updateStream_ = nullptr;
/*! \brief GPU context (for OpenCL builds)
* \todo Make a Context class usable in CPU code
*/
*
* \todo Template on locality.
*
- * \param[in,out] d_data Device-side buffer.
- * \param[in,out] h_data Host-side buffer.
- * \param[in] dataSize Device-side data allocation size.
- * \param[in] atomLocality If all, local or non-local ranges should be copied.
+ * \param[out] d_data Device-side buffer.
+ * \param[in] h_data Host-side buffer.
+ * \param[in] dataSize Device-side data allocation size.
+ * \param[in] atomLocality If all, local or non-local ranges should be copied.
+ * \param[in] commandStream GPU stream to execute copy in.
*/
- void copyToDevice(DeviceBuffer<float> d_data,
- gmx::ArrayRef<const gmx::RVec> h_data,
- int dataSize,
- AtomLocality atomLocality);
+ void copyToDevice(DeviceBuffer<float> d_data,
+ const gmx::ArrayRef<const gmx::RVec> h_data,
+ int dataSize,
+ AtomLocality atomLocality,
+ CommandStream commandStream);
/*! \brief Performs the copy of data from device to host buffer.
*
- * \param[in,out] h_data Host-side buffer.
- * \param[in,out] d_data Device-side buffer.
- * \param[in] dataSize Device-side data allocation size.
- * \param[in] atomLocality If all, local or non-local ranges should be copied.
+ * \param[out] h_data Host-side buffer.
+ * \param[in] d_data Device-side buffer.
+ * \param[in] dataSize Device-side data allocation size.
+ * \param[in] atomLocality If all, local or non-local ranges should be copied.
+ * \param[in] commandStream GPU stream to execute copy in.
*/
void copyFromDevice(gmx::ArrayRef<gmx::RVec> h_data,
DeviceBuffer<float> d_data,
int dataSize,
- AtomLocality atomLocality);
+ AtomLocality atomLocality,
+ CommandStream commandStream);
};
} // namespace gmx
namespace gmx
{
-StatePropagatorDataGpu::Impl::Impl(gmx_unused const void *commandStream,
- gmx_unused const void *deviceContext,
+StatePropagatorDataGpu::Impl::Impl(const void *pmeStream,
+ const void *localStream,
+ const void *nonLocalStream,
+ const void *deviceContext,
GpuApiCallBehavior transferKind,
int paddingSize) :
transferKind_(transferKind),
GMX_RELEASE_ASSERT(getenv("GMX_USE_GPU_BUFFER_OPS") == nullptr, "GPU buffer ops are not supported in this build.");
- // Set the stream-context pair for the OpenCL builds,
- // use the nullptr stream for CUDA builds
-#if GMX_GPU == GMX_GPU_OPENCL
- if (commandStream != nullptr)
+ if (pmeStream != nullptr)
+ {
+ pmeStream_ = *static_cast<const CommandStream*>(pmeStream);
+ }
+ if (localStream != nullptr)
+ {
+ localStream_ = *static_cast<const CommandStream*>(localStream);
+ }
+ if (nonLocalStream != nullptr)
{
- commandStream_ = *static_cast<const CommandStream*>(commandStream);
+ nonLocalStream_ = *static_cast<const CommandStream*>(nonLocalStream);
}
+// The update stream is only used in CUDA builds; the OpenCL build never uses it.
+// TODO: The update stream should be created only when it is needed.
+#if GMX_GPU == GMX_GPU_CUDA
+ cudaError_t stat;
+ stat = cudaStreamCreate(&updateStream_);
+ CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu");
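+ // The stream created here is owned by this object; the matching cudaStreamDestroy
+ // belongs in the destructor (not shown in this hunk).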
+#endif
if (deviceContext != nullptr)
{
deviceContext_ = *static_cast<const DeviceContext*>(deviceContext);
}
-#endif
}
const size_t paddingAllocationSize = numAtomsPadded - numAtomsAll_;
if (paddingAllocationSize > 0)
{
- clearDeviceBufferAsync(&d_x_, DIM*numAtomsAll_, DIM*paddingAllocationSize, commandStream_);
+ // The PME stream is used here because:
+ // 1. The padding clearing is only needed by PME.
+ // 2. It is the stream that is created in the PME OpenCL context.
+ clearDeviceBufferAsync(&d_x_, DIM*numAtomsAll_, DIM*paddingAllocationSize, pmeStream_);
}
reallocateDeviceBuffer(&d_v_, DIM*numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<float> d_data,
const gmx::ArrayRef<const gmx::RVec> h_data,
int dataSize,
- AtomLocality atomLocality)
+ AtomLocality atomLocality,
+ CommandStream commandStream)
{
#if GMX_GPU == GMX_GPU_OPENCL
GMX_ASSERT(deviceContext_ != nullptr, "GPU context should be set in OpenCL builds.");
+ // The PME stream is used for OpenCL builds, because it is associated with the context of the
+ // PME task, which is currently the only OpenCL user of the coordinates managed here.
+ // TODO: This will have to be changed when the OpenCL implementation is extended.
+ commandStream = pmeStream_;
#endif
GMX_UNUSED_VALUE(dataSize);
GMX_ASSERT(elementsStartAt + numElementsToCopy <= dataSize, "The device allocation is smaller than requested copy range.");
GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(), "The host buffer is smaller than the requested copy range.");
- // TODO: Use the proper stream
copyToDeviceBuffer(&d_data, reinterpret_cast<const float *>(&h_data.data()[atomsStartAt]),
elementsStartAt, numElementsToCopy,
- commandStream_, transferKind_, nullptr);
+ commandStream, transferKind_, nullptr);
}
}
void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec> h_data,
DeviceBuffer<float> d_data,
int dataSize,
- AtomLocality atomLocality)
+ AtomLocality atomLocality,
+ CommandStream commandStream)
{
#if GMX_GPU == GMX_GPU_OPENCL
GMX_ASSERT(deviceContext_ != nullptr, "GPU context should be set in OpenCL builds.");
+ // The PME stream is used for OpenCL builds, because it is associated with the context of the
+ // PME task, which is currently the only OpenCL user of the coordinates managed here.
+ // TODO: This will have to be changed when the OpenCL implementation is extended.
+ commandStream = pmeStream_;
#endif
GMX_UNUSED_VALUE(dataSize);
GMX_ASSERT(elementsStartAt + numElementsToCopy <= dataSize, "The device allocation is smaller than requested copy range.");
GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(), "The host buffer is smaller than the requested copy range.");
- // TODO: Use the proper stream
copyFromDeviceBuffer(reinterpret_cast<float*>(&h_data.data()[atomsStartAt]), &d_data,
elementsStartAt, numElementsToCopy,
- commandStream_, transferKind_, nullptr);
-
+ commandStream, transferKind_, nullptr);
}
}
void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec> h_x,
AtomLocality atomLocality)
{
- copyToDevice(d_x_, h_x, d_xSize_, atomLocality);
+ // TODO: Use the correct stream. A nullptr stream here selects the default stream in
+ //       CUDA builds and is replaced with pmeStream_ inside the copy helpers for OpenCL builds.
+ copyToDevice(d_x_, h_x, d_xSize_, atomLocality, nullptr);
}
void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x,
AtomLocality atomLocality)
{
- copyFromDevice(h_x, d_x_, d_xSize_, atomLocality);
+ // TODO: Use the correct stream
+ copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, nullptr);
}
void StatePropagatorDataGpu::Impl::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec> h_v,
AtomLocality atomLocality)
{
- copyToDevice(d_v_, h_v, d_vSize_, atomLocality);
+ // TODO: Use the correct stream
+ copyToDevice(d_v_, h_v, d_vSize_, atomLocality, nullptr);
}
void StatePropagatorDataGpu::Impl::copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec> h_v,
AtomLocality atomLocality)
{
- copyFromDevice(h_v, d_v_, d_vSize_, atomLocality);
+ // TODO: Use the correct stream
+ copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, nullptr);
}
void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> h_f,
AtomLocality atomLocality)
{
- copyToDevice(d_f_, h_f, d_fSize_, atomLocality);
+ // TODO: Use the correct stream
+ copyToDevice(d_f_, h_f, d_fSize_, atomLocality, nullptr);
}
void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec> h_f,
AtomLocality atomLocality)
{
- copyFromDevice(h_f, d_f_, d_fSize_, atomLocality);
+ // TODO: Use the correct stream
+ copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, nullptr);
}
-void StatePropagatorDataGpu::Impl::synchronizeStream()
+void* StatePropagatorDataGpu::Impl::getUpdateStream()
{
- gpuStreamSynchronize(commandStream_);
+ return updateStream_;
}
int StatePropagatorDataGpu::Impl::numAtomsLocal()
-StatePropagatorDataGpu::StatePropagatorDataGpu(const void *commandStream,
+StatePropagatorDataGpu::StatePropagatorDataGpu(const void *pmeStream,
+ const void *localStream,
+ const void *nonLocalStream,
const void *deviceContext,
GpuApiCallBehavior transferKind,
int paddingSize)
- : impl_(new Impl(commandStream,
+ : impl_(new Impl(pmeStream,
+ localStream,
+ nonLocalStream,
deviceContext,
transferKind,
paddingSize))
return impl_->copyForcesFromGpu(h_f, atomLocality);
}
-void StatePropagatorDataGpu::synchronizeStream()
+void* StatePropagatorDataGpu::getUpdateStream()
{
- return impl_->synchronizeStream();
+ return impl_->getUpdateStream();
}
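+// Usage (from this change): the update module is constructed on this stream, e.g.
+//   integrator = std::make_unique<UpdateConstrainCuda>(*ir, *top_global,
+//                                                      fr->stateGpu->getUpdateStream());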
int StatePropagatorDataGpu::numAtomsLocal()