From 70c5d1ed664242c9f469c1884f9280f6ba299481 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= Date: Mon, 30 Sep 2019 15:04:10 +0200 Subject: [PATCH] Add management for coordinates copy events into StatePropagatorDataGpu The coordinate copies are now assign a GPU stream and fire an event when done. The consumers can now wait on coordinates to be ready on Host or get the GPU event to enqueue a wait on Device. Change-Id: Ia33e366f32d777ec980940ff7e284ab0b3498637 --- .../mdtypes/state_propagator_data_gpu.h | 52 +++++-- .../state_propagator_data_gpu_impl.cpp | 39 ++++-- .../mdtypes/state_propagator_data_gpu_impl.h | 70 ++++++++-- .../state_propagator_data_gpu_impl_gpu.cpp | 129 ++++++++++++------ 4 files changed, 210 insertions(+), 80 deletions(-) diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h index ff43c4a807..a061917ca7 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h @@ -56,6 +56,8 @@ #include "gromacs/utility/arrayref.h" #include "gromacs/utility/classhelpers.h" +class GpuEventSynchronizer; + namespace gmx { @@ -87,14 +89,29 @@ class StatePropagatorDataGpu * PME work on the GPU, and if that rank also does PP work that is the only * rank. So all coordinates are always transferred. * - * \note \p commandStream and \p deviceContext are allowed to be nullptr if - * StatePropagatorDataGpu is not used in the OpenCL run (e.g. if PME - * does not run on the GPU). + * In OpenCL, only pmeStream is used since it is the only stream created in + * PME context. The local and non-local streams are only needed when buffer + * ops are offloaded. This feature is currently not available in OpenCL and + * hence these streams are not set in these builds. + * + * \note In CUDA, the update stream is created in the constructor as a temporary + * solution, in place until the stream manager is introduced. + * Note that this makes it impossible to construct this object in CUDA + * builds executing on a host without any CUDA-capable device available. + * + * \note In CUDA, \p deviceContext is unused, hence always nullptr; + * all stream arguments can also be nullptr in runs where the + * respective streams are not required. + * In OpenCL, \p deviceContext needs to be a valid device context. + * In OpenCL runs StatePropagatorDataGpu is currently only used + * with PME offload, and only on ranks with PME duty. Hence, the + * \p pmeStream argument needs to be a valid OpenCL queue object + * which must have been created in \p deviceContext. * - * \todo A CommandStream is now visible in the CPU parts of the code so we - * can stop passing a void*. - * \todo A DeviceContext object is visible in CPU parts of the code so we - * can stop passing a void*. + * \todo Make a \p CommandStream visible in the CPU parts of the code so we + * will not have to pass a void*. + * \todo Make a \p DeviceContext object visible in CPU parts of the code so we + * will not have to pass a void*. * * \param[in] pmeStream Device PME stream, nullptr allowed. * \param[in] localStream Device NBNXM local stream, nullptr allowed. @@ -118,8 +135,11 @@ class StatePropagatorDataGpu /*! \brief Set the ranges for local and non-local atoms and reallocates buffers. * - * The coordinates buffer is reallocated with the padding added at the end. The - * size of padding is set by the constructor. + * \note + * The coordinates buffer is (re)allocated, when required by PME, with a padding, + * the size of which is set by the constructor. The padding region clearing kernel + * is scheduled in the \p pmeStream_ (unlike the coordinates H2D) as only the PME + * task uses this padding area. * * \param[in] numAtomsLocal Number of atoms in local domain. * \param[in] numAtomsAll Total number of atoms to handle. @@ -153,6 +173,14 @@ class StatePropagatorDataGpu void copyCoordinatesToGpu(gmx::ArrayRef h_x, AtomLocality atomLocality); + /*! \brief Get the event synchronizer on the H2D coordinates copy. + * + * \param[in] atomLocality Locality of the particles to wait for. + * + * \returns The event to synchronize the stream that consumes coordinates on device. + */ + GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality); + /*! \brief Copy positions from the GPU memory. * * \param[in] h_x Positions buffer in the host memory. @@ -161,6 +189,12 @@ class StatePropagatorDataGpu void copyCoordinatesFromGpu(gmx::ArrayRef h_x, AtomLocality atomLocality); + /*! \brief Wait until coordinates are available on the host. + * + * \param[in] atomLocality Locality of the particles to wait for. + */ + void waitCoordinatesReadyOnHost(AtomLocality atomLocality); + /*! \brief Get the velocities buffer on the GPU. * diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp index 6a7dd64730..7e2072f7da 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp @@ -73,86 +73,97 @@ StatePropagatorDataGpu::~StatePropagatorDataGpu() = default; void StatePropagatorDataGpu::reinit(int /* numAtomsLocal */, int /* numAtomsAll */) { - GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation."); } std::tuple StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality /* atomLocality */) { - GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation."); return std::make_tuple(0, 0); } DeviceBuffer StatePropagatorDataGpu::getCoordinates() { - GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation."); return DeviceBuffer {}; } +GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality /* atomLocality */) +{ + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation."); + return nullptr; +} + void StatePropagatorDataGpu::copyCoordinatesToGpu(const gmx::ArrayRef /* h_x */, AtomLocality /* atomLocality */) { - GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation."); +} + +void StatePropagatorDataGpu::waitCoordinatesReadyOnHost(AtomLocality /* atomLocality */) +{ + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation."); } void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef /* h_x */, AtomLocality /* atomLocality */) { - GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation."); } DeviceBuffer StatePropagatorDataGpu::getVelocities() { - GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation."); return DeviceBuffer {}; } void StatePropagatorDataGpu::copyVelocitiesToGpu(const gmx::ArrayRef /* h_v */, AtomLocality /* atomLocality */) { - GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation."); } void StatePropagatorDataGpu::copyVelocitiesFromGpu(gmx::ArrayRef /* h_v */, AtomLocality /* atomLocality */) { - GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation."); } DeviceBuffer StatePropagatorDataGpu::getForces() { - GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation."); return DeviceBuffer {}; } void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef /* h_f */, AtomLocality /* atomLocality */) { - GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation."); } void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef /* h_f */, AtomLocality /* atomLocality */) { - GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation."); } void* StatePropagatorDataGpu::getUpdateStream() { - GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation."); return nullptr; } int StatePropagatorDataGpu::numAtomsLocal() { - GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation."); return 0; } int StatePropagatorDataGpu::numAtomsAll() { - GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation."); return 0; } diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h index 0b73395af8..037eeadedf 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h @@ -45,10 +45,18 @@ #include "gmxpre.h" +#include "config.h" + #include "gromacs/gpu_utils/devicebuffer.h" +#if GMX_GPU == GMX_GPU_CUDA +#include "gromacs/gpu_utils/gpueventsynchronizer.cuh" +#elif GMX_GPU == GMX_GPU_OPENCL +#include "gromacs/gpu_utils/gpueventsynchronizer_ocl.h" +#endif #include "gromacs/math/vectypes.h" #include "gromacs/mdtypes/state_propagator_data_gpu.h" #include "gromacs/utility/classhelpers.h" +#include "gromacs/utility/enumerationhelpers.h" namespace gmx { @@ -71,14 +79,29 @@ class StatePropagatorDataGpu::Impl * PME work on the GPU, and if that rank also does PP work that is the only * rank. So all coordinates are always transferred. * - * \note \p commandStream and \p deviceContext are allowed to be nullptr if - * StatePropagatorDataGpu is not used in the OpenCL run (e.g. if PME - * does not run on the GPU). + * In OpenCL, only pmeStream is used since it is the only stream created in + * PME context. The local and non-local streams are only needed when buffer + * ops are offloaded. This feature is currently not available in OpenCL and + * hence these streams are not set in these builds. + * + * \note In CUDA, the update stream is created in the constructor as a temporary + * solution, in place until the stream manager is introduced. + * Note that this makes it impossible to construct this object in CUDA + * builds executing on a host without any CUDA-capable device available. * - * \todo A CommandStream is now visible in the CPU parts of the code so we - * can stop passing a void*. - * \todo A DeviceContext object is visible in CPU parts of the code so we - * can stop passing a void*. + * \note In CUDA, \p deviceContext is unused, hence always nullptr; + * all stream arguments can also be nullptr in runs where the + * respective streams are not required. + * In OpenCL, \p deviceContext needs to be a valid device context. + * In OpenCL runs StatePropagatorDataGpu is currently only used + * with PME offload, and only on ranks with PME duty. Hence, the + * \p pmeStream argument needs to be a valid OpenCL queue object + * which must have been created in \p deviceContext. + * + * \todo Make a \p CommandStream visible in the CPU parts of the code so we + * will not have to pass a void*. + * \todo Make a \p DeviceContext object visible in CPU parts of the code so we + * will not have to pass a void*. * * \param[in] pmeStream Device PME stream, nullptr allowed. * \param[in] localStream Device NBNXM local stream, nullptr allowed. @@ -99,11 +122,11 @@ class StatePropagatorDataGpu::Impl /*! \brief Set the ranges for local and non-local atoms and reallocates buffers. * - * The coordinates buffer is reallocated with the padding added at the end. The - * size of padding is set by the constructor. - * - * \note The PME requires clearing of the padding, which is done in the pmeStream_. - * Hence the pmeStream_ should be created in the gpuContext_. + * \note + * The coordinates buffer is (re)allocated, when required by PME, with a padding, + * the size of which is set by the constructor. The padding region clearing kernel + * is scheduled in the \p pmeStream_ (unlike the coordinates H2D) as only the PME + * task uses this padding area. * * \param[in] numAtomsLocal Number of atoms in local domain. * \param[in] numAtomsAll Total number of atoms to handle. @@ -137,6 +160,14 @@ class StatePropagatorDataGpu::Impl void copyCoordinatesToGpu(gmx::ArrayRef h_x, AtomLocality atomLocality); + /*! \brief Get the event synchronizer on the H2D coordinates copy. + * + * \param[in] atomLocality Locality of the particles to wait for. + * + * \returns The event to synchronize the stream that consumes coordinates on device. + */ + GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality); + /*! \brief Copy positions from the GPU memory. * * \param[in] h_x Positions buffer in the host memory. @@ -145,6 +176,12 @@ class StatePropagatorDataGpu::Impl void copyCoordinatesFromGpu(gmx::ArrayRef h_x, AtomLocality atomLocality); + /*! \brief Wait until coordinates are available on the host. + * + * \param[in] atomLocality Locality of the particles to wait for. + */ + void waitCoordinatesReadyOnHost(AtomLocality atomLocality); + /*! \brief Get the velocities buffer on the GPU. * @@ -221,6 +258,15 @@ class StatePropagatorDataGpu::Impl CommandStream nonLocalStream_ = nullptr; //! GPU Update-constreaints stream. CommandStream updateStream_ = nullptr; + + // Streams to use for coordinates H2S and D2H copies (one event for each atom locality) + EnumerationArray xCopyStreams_ = {{nullptr}}; + + //! An array of events that indicate H2D copy is complete (one event for each atom locality) + EnumerationArray xReadyOnDevice_; + //! An array of events that indicate D2H copy is complete (one event for each atom locality) + EnumerationArray xReadyOnHost_; + /*! \brief GPU context (for OpenCL builds) * \todo Make a Context class usable in CPU code */ diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp index 3cc0bb7653..ba6850db45 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp @@ -71,33 +71,52 @@ StatePropagatorDataGpu::Impl::Impl(const void *pmeStream, transferKind_(transferKind), paddingSize_(paddingSize) { - + static_assert(GMX_GPU != GMX_GPU_NONE, "This object should only be constructed on the GPU code-paths."); GMX_RELEASE_ASSERT(getenv("GMX_USE_GPU_BUFFER_OPS") == nullptr, "GPU buffer ops are not supported in this build."); - if (pmeStream != nullptr) - { - pmeStream_ = *static_cast(pmeStream); - } - if (localStream != nullptr) + // TODO: Refactor when the StreamManager is introduced. + if (GMX_GPU == GMX_GPU_OPENCL) { - localStream_ = *static_cast(localStream); + GMX_ASSERT(deviceContext != nullptr, "GPU context should be set in OpenCL builds."); + GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set in OpenCL builds."); + + // The update stream is set to the PME stream in OpenCL, since PME stream is the only stream created in the PME context. + pmeStream_ = *static_cast(pmeStream); + updateStream_ = *static_cast(pmeStream); + deviceContext_ = *static_cast(deviceContext); + GMX_UNUSED_VALUE(localStream); + GMX_UNUSED_VALUE(nonLocalStream); } - if (nonLocalStream != nullptr) + + if (GMX_GPU == GMX_GPU_CUDA) { - nonLocalStream_ = *static_cast(nonLocalStream); - } -// The OpenCL build will never use the updateStream -// TODO: The update stream should be created only when it is needed. -#if GMX_GPU == GMX_GPU_CUDA - cudaError_t stat; - stat = cudaStreamCreate(&updateStream_); - CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu"); + if (pmeStream != nullptr) + { + pmeStream_ = *static_cast(pmeStream); + } + if (localStream != nullptr) + { + localStream_ = *static_cast(localStream); + } + if (nonLocalStream != nullptr) + { + nonLocalStream_ = *static_cast(nonLocalStream); + } + + // TODO: The update stream should be created only when it is needed. +#if (GMX_GPU == GMX_GPU_CUDA) + cudaError_t stat; + stat = cudaStreamCreate(&updateStream_); + CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu"); #endif - if (deviceContext != nullptr) - { - deviceContext_ = *static_cast(deviceContext); + GMX_UNUSED_VALUE(deviceContext); } + // Map the atom locality to the stream that will be used for coordinates transfer. + // Same streams are used for H2D and D2H copies + xCopyStreams_[AtomLocality::Local] = updateStream_; + xCopyStreams_[AtomLocality::NonLocal] = nonLocalStream_; + xCopyStreams_[AtomLocality::All] = updateStream_; } StatePropagatorDataGpu::Impl::~Impl() @@ -106,9 +125,6 @@ StatePropagatorDataGpu::Impl::~Impl() void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll) { -#if GMX_GPU == GMX_GPU_OPENCL - GMX_ASSERT(deviceContext_ != nullptr, "GPU context should be set in OpenCL builds."); -#endif numAtomsLocal_ = numAtomsLocal; numAtomsAll_ = numAtomsAll; @@ -127,9 +143,7 @@ void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll) const size_t paddingAllocationSize = numAtomsPadded - numAtomsAll_; if (paddingAllocationSize > 0) { - // The PME stream is used here because: - // 1. The padding clearing is only needed by PME. - // 2. It is the stream that is created in the PME OpenCL context. + // The PME stream is used here because the padding region of d_x_ is only in the PME task. clearDeviceBufferAsync(&d_x_, DIM*numAtomsAll_, DIM*paddingAllocationSize, pmeStream_); } @@ -171,14 +185,6 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer CommandStream commandStream) { -#if GMX_GPU == GMX_GPU_OPENCL - GMX_ASSERT(deviceContext_ != nullptr, "GPU context should be set in OpenCL builds."); - // The PME stream is used for OpenCL builds, because it is the context that it associated with the - // PME task which requires the coordinates managed here in OpenCL. - // TODO: This will have to be changed when the OpenCL implementation will be extended. - commandStream = pmeStream_; -#endif - GMX_UNUSED_VALUE(dataSize); GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated."); @@ -207,14 +213,6 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef h_da CommandStream commandStream) { -#if GMX_GPU == GMX_GPU_OPENCL - GMX_ASSERT(deviceContext_ != nullptr, "GPU context should be set in OpenCL builds."); - // The PME stream is used for OpenCL builds, because it is the context that it associated with the - // PME task which requires the coordinates managed here in OpenCL. - // TODO: This will have to be changed when the OpenCL implementation will be extended. - commandStream = pmeStream_; -#endif - GMX_UNUSED_VALUE(dataSize); GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated."); @@ -244,15 +242,46 @@ DeviceBuffer StatePropagatorDataGpu::Impl::getCoordinates() void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef h_x, AtomLocality atomLocality) { - // TODO: Use the correct stream - copyToDevice(d_x_, h_x, d_xSize_, atomLocality, nullptr); + GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality."); + CommandStream commandStream = xCopyStreams_[atomLocality]; + GMX_ASSERT(commandStream != nullptr, "No stream is valid for copying positions with given atom locality."); + + copyToDevice(d_x_, h_x, d_xSize_, atomLocality, commandStream); + + // markEvent is skipped in OpenCL as: + // - it's not needed, copy is done in the same stream as the only consumer task (PME) + // - we don't consume the events in OpenCL which is not allowed by GpuEventSynchronizer (would leak memory). + // TODO: remove this by adding an event-mark free flavor of this function + if (GMX_GPU == GMX_GPU_CUDA) + { + xReadyOnDevice_[atomLocality].markEvent(commandStream); + // TODO: Remove When event-based synchronization is introduced + gpuStreamSynchronize(commandStream); + } +} + +GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality) +{ + return &xReadyOnDevice_[atomLocality]; } void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef h_x, AtomLocality atomLocality) { - // TODO: Use the correct stream - copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, nullptr); + GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality."); + CommandStream commandStream = xCopyStreams_[atomLocality]; + GMX_ASSERT(commandStream != nullptr, "No stream is valid for copying positions with given atom locality."); + + copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, commandStream); + // TODO: Remove When event-based synchronization is introduced + gpuStreamSynchronize(commandStream); + // Note: unlike copyCoordinatesToGpu this is not used in OpenCL, and the conditional is not needed. + xReadyOnHost_[atomLocality].markEvent(commandStream); +} + +void StatePropagatorDataGpu::Impl::waitCoordinatesReadyOnHost(AtomLocality atomLocality) +{ + xReadyOnHost_[atomLocality].waitForEvent(); } @@ -356,12 +385,22 @@ void StatePropagatorDataGpu::copyCoordinatesToGpu(const gmx::ArrayRefcopyCoordinatesToGpu(h_x, atomLocality); } +GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality) +{ + return impl_->getCoordinatesReadyOnDeviceEvent(atomLocality); +} + void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef h_x, AtomLocality atomLocality) { return impl_->copyCoordinatesFromGpu(h_x, atomLocality); } +void StatePropagatorDataGpu::waitCoordinatesReadyOnHost(AtomLocality atomLocality) +{ + return impl_->waitCoordinatesReadyOnHost(atomLocality); +} + DeviceBuffer StatePropagatorDataGpu::getVelocities() { -- 2.22.0