#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/classhelpers.h"
+class GpuEventSynchronizer;
+
namespace gmx
{
* PME work on the GPU, and if that rank also does PP work that is the only
* rank. So all coordinates are always transferred.
*
- * \note \p commandStream and \p deviceContext are allowed to be nullptr if
- * StatePropagatorDataGpu is not used in the OpenCL run (e.g. if PME
- * does not run on the GPU).
+ * In OpenCL, only pmeStream is used since it is the only stream created in
+ * PME context. The local and non-local streams are only needed when buffer
+ * ops are offloaded. This feature is currently not available in OpenCL and
+ * hence these streams are not set in these builds.
+ *
+ * \note In CUDA, the update stream is created in the constructor as a temporary
+ * solution, in place until the stream manager is introduced.
+ * Note that this makes it impossible to construct this object in CUDA
+ * builds executing on a host without any CUDA-capable device available.
+ *
+ * \note In CUDA, \p deviceContext is unused, hence always nullptr;
+ * all stream arguments can also be nullptr in runs where the
+ * respective streams are not required.
+ * In OpenCL, \p deviceContext needs to be a valid device context.
+ * In OpenCL runs StatePropagatorDataGpu is currently only used
+ * with PME offload, and only on ranks with PME duty. Hence, the
+ * \p pmeStream argument needs to be a valid OpenCL queue object
+ * which must have been created in \p deviceContext.
*
- * \todo A CommandStream is now visible in the CPU parts of the code so we
- * can stop passing a void*.
- * \todo A DeviceContext object is visible in CPU parts of the code so we
- * can stop passing a void*.
+ * \todo Make a \p CommandStream visible in the CPU parts of the code so we
+ * will not have to pass a void*.
+ * \todo Make a \p DeviceContext object visible in CPU parts of the code so we
+ * will not have to pass a void*.
*
* \param[in] pmeStream Device PME stream, nullptr allowed.
* \param[in] localStream Device NBNXM local stream, nullptr allowed.
/*! \brief Set the ranges for local and non-local atoms and reallocates buffers.
*
- * The coordinates buffer is reallocated with the padding added at the end. The
- * size of padding is set by the constructor.
+ * \note
+ * The coordinates buffer is (re)allocated, when required by PME, with a padding,
+ * the size of which is set by the constructor. The padding region clearing kernel
+ * is scheduled in the \p pmeStream_ (unlike the coordinates H2D) as only the PME
+ * task uses this padding area.
*
* \param[in] numAtomsLocal Number of atoms in local domain.
* \param[in] numAtomsAll Total number of atoms to handle.
void copyCoordinatesToGpu(gmx::ArrayRef<const gmx::RVec> h_x,
AtomLocality atomLocality);
+ /*! \brief Get the event synchronizer on the H2D coordinates copy.
+ *
+ * \param[in] atomLocality Locality of the particles to wait for.
+ *
+ * \returns The event to synchronize the stream that consumes coordinates on device.
+ */
+ GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality);
+
/*! \brief Copy positions from the GPU memory.
*
* \param[in] h_x Positions buffer in the host memory.
void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x,
AtomLocality atomLocality);
+ /*! \brief Wait until coordinates are available on the host.
+ *
+ * \param[in] atomLocality Locality of the particles to wait for.
+ */
+ void waitCoordinatesReadyOnHost(AtomLocality atomLocality);
+
/*! \brief Get the velocities buffer on the GPU.
*
void StatePropagatorDataGpu::reinit(int /* numAtomsLocal */,
int /* numAtomsAll */)
{
- GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
}
std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality /* atomLocality */)
{
- GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
return std::make_tuple(0, 0);
}
DeviceBuffer<float> StatePropagatorDataGpu::getCoordinates()
{
- GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
return DeviceBuffer<float> {};
}
+GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality /* atomLocality */)
+{
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
+ return nullptr;
+}
+
void StatePropagatorDataGpu::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec> /* h_x */,
AtomLocality /* atomLocality */)
{
- GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
+}
+
+void StatePropagatorDataGpu::waitCoordinatesReadyOnHost(AtomLocality /* atomLocality */)
+{
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
}
void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> /* h_x */,
AtomLocality /* atomLocality */)
{
- GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
}
DeviceBuffer<float> StatePropagatorDataGpu::getVelocities()
{
- GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
return DeviceBuffer<float> {};
}
void StatePropagatorDataGpu::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec> /* h_v */,
AtomLocality /* atomLocality */)
{
- GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
}
void StatePropagatorDataGpu::copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec> /* h_v */,
AtomLocality /* atomLocality */)
{
- GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
}
DeviceBuffer<float> StatePropagatorDataGpu::getForces()
{
- GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
return DeviceBuffer<float> {};
}
void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> /* h_f */,
AtomLocality /* atomLocality */)
{
- GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
}
void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec> /* h_f */,
AtomLocality /* atomLocality */)
{
- GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
}
void* StatePropagatorDataGpu::getUpdateStream()
{
- GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
return nullptr;
}
int StatePropagatorDataGpu::numAtomsLocal()
{
- GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
return 0;
}
int StatePropagatorDataGpu::numAtomsAll()
{
- GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
return 0;
}
#include "gmxpre.h"
+#include "config.h"
+
#include "gromacs/gpu_utils/devicebuffer.h"
+#if GMX_GPU == GMX_GPU_CUDA
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#elif GMX_GPU == GMX_GPU_OPENCL
+#include "gromacs/gpu_utils/gpueventsynchronizer_ocl.h"
+#endif
#include "gromacs/math/vectypes.h"
#include "gromacs/mdtypes/state_propagator_data_gpu.h"
#include "gromacs/utility/classhelpers.h"
+#include "gromacs/utility/enumerationhelpers.h"
namespace gmx
{
* PME work on the GPU, and if that rank also does PP work that is the only
* rank. So all coordinates are always transferred.
*
- * \note \p commandStream and \p deviceContext are allowed to be nullptr if
- * StatePropagatorDataGpu is not used in the OpenCL run (e.g. if PME
- * does not run on the GPU).
+ * In OpenCL, only pmeStream is used since it is the only stream created in
+ * PME context. The local and non-local streams are only needed when buffer
+ * ops are offloaded. This feature is currently not available in OpenCL and
+ * hence these streams are not set in these builds.
+ *
+ * \note In CUDA, the update stream is created in the constructor as a temporary
+ * solution, in place until the stream manager is introduced.
+ * Note that this makes it impossible to construct this object in CUDA
+ * builds executing on a host without any CUDA-capable device available.
*
- * \todo A CommandStream is now visible in the CPU parts of the code so we
- * can stop passing a void*.
- * \todo A DeviceContext object is visible in CPU parts of the code so we
- * can stop passing a void*.
+ * \note In CUDA, \p deviceContext is unused, hence always nullptr;
+ * all stream arguments can also be nullptr in runs where the
+ * respective streams are not required.
+ * In OpenCL, \p deviceContext needs to be a valid device context.
+ * In OpenCL runs StatePropagatorDataGpu is currently only used
+ * with PME offload, and only on ranks with PME duty. Hence, the
+ * \p pmeStream argument needs to be a valid OpenCL queue object
+ * which must have been created in \p deviceContext.
+ *
+ * \todo Make a \p CommandStream visible in the CPU parts of the code so we
+ * will not have to pass a void*.
+ * \todo Make a \p DeviceContext object visible in CPU parts of the code so we
+ * will not have to pass a void*.
*
* \param[in] pmeStream Device PME stream, nullptr allowed.
* \param[in] localStream Device NBNXM local stream, nullptr allowed.
/*! \brief Set the ranges for local and non-local atoms and reallocates buffers.
*
- * The coordinates buffer is reallocated with the padding added at the end. The
- * size of padding is set by the constructor.
- *
- * \note The PME requires clearing of the padding, which is done in the pmeStream_.
- * Hence the pmeStream_ should be created in the gpuContext_.
+ * \note
+ * The coordinates buffer is (re)allocated, when required by PME, with a padding,
+ * the size of which is set by the constructor. The padding region clearing kernel
+ * is scheduled in the \p pmeStream_ (unlike the coordinates H2D) as only the PME
+ * task uses this padding area.
*
* \param[in] numAtomsLocal Number of atoms in local domain.
* \param[in] numAtomsAll Total number of atoms to handle.
void copyCoordinatesToGpu(gmx::ArrayRef<const gmx::RVec> h_x,
AtomLocality atomLocality);
+ /*! \brief Get the event synchronizer on the H2D coordinates copy.
+ *
+ * \param[in] atomLocality Locality of the particles to wait for.
+ *
+ * \returns The event to synchronize the stream that consumes coordinates on device.
+ */
+ GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality);
+
/*! \brief Copy positions from the GPU memory.
*
* \param[in] h_x Positions buffer in the host memory.
void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x,
AtomLocality atomLocality);
+ /*! \brief Wait until coordinates are available on the host.
+ *
+ * \param[in] atomLocality Locality of the particles to wait for.
+ */
+ void waitCoordinatesReadyOnHost(AtomLocality atomLocality);
+
/*! \brief Get the velocities buffer on the GPU.
*
CommandStream nonLocalStream_ = nullptr;
//! GPU Update-constreaints stream.
CommandStream updateStream_ = nullptr;
+
+ // Streams to use for coordinates H2S and D2H copies (one event for each atom locality)
+ EnumerationArray<AtomLocality, CommandStream> xCopyStreams_ = {{nullptr}};
+
+ //! An array of events that indicate H2D copy is complete (one event for each atom locality)
+ EnumerationArray<AtomLocality, GpuEventSynchronizer> xReadyOnDevice_;
+ //! An array of events that indicate D2H copy is complete (one event for each atom locality)
+ EnumerationArray<AtomLocality, GpuEventSynchronizer> xReadyOnHost_;
+
/*! \brief GPU context (for OpenCL builds)
* \todo Make a Context class usable in CPU code
*/
transferKind_(transferKind),
paddingSize_(paddingSize)
{
-
+ static_assert(GMX_GPU != GMX_GPU_NONE, "This object should only be constructed on the GPU code-paths.");
GMX_RELEASE_ASSERT(getenv("GMX_USE_GPU_BUFFER_OPS") == nullptr, "GPU buffer ops are not supported in this build.");
- if (pmeStream != nullptr)
- {
- pmeStream_ = *static_cast<const CommandStream*>(pmeStream);
- }
- if (localStream != nullptr)
+ // TODO: Refactor when the StreamManager is introduced.
+ if (GMX_GPU == GMX_GPU_OPENCL)
{
- localStream_ = *static_cast<const CommandStream*>(localStream);
+ GMX_ASSERT(deviceContext != nullptr, "GPU context should be set in OpenCL builds.");
+ GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set in OpenCL builds.");
+
+ // The update stream is set to the PME stream in OpenCL, since PME stream is the only stream created in the PME context.
+ pmeStream_ = *static_cast<const CommandStream*>(pmeStream);
+ updateStream_ = *static_cast<const CommandStream*>(pmeStream);
+ deviceContext_ = *static_cast<const DeviceContext*>(deviceContext);
+ GMX_UNUSED_VALUE(localStream);
+ GMX_UNUSED_VALUE(nonLocalStream);
}
- if (nonLocalStream != nullptr)
+
+ if (GMX_GPU == GMX_GPU_CUDA)
{
- nonLocalStream_ = *static_cast<const CommandStream*>(nonLocalStream);
- }
-// The OpenCL build will never use the updateStream
-// TODO: The update stream should be created only when it is needed.
-#if GMX_GPU == GMX_GPU_CUDA
- cudaError_t stat;
- stat = cudaStreamCreate(&updateStream_);
- CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu");
+ if (pmeStream != nullptr)
+ {
+ pmeStream_ = *static_cast<const CommandStream*>(pmeStream);
+ }
+ if (localStream != nullptr)
+ {
+ localStream_ = *static_cast<const CommandStream*>(localStream);
+ }
+ if (nonLocalStream != nullptr)
+ {
+ nonLocalStream_ = *static_cast<const CommandStream*>(nonLocalStream);
+ }
+
+ // TODO: The update stream should be created only when it is needed.
+#if (GMX_GPU == GMX_GPU_CUDA)
+ cudaError_t stat;
+ stat = cudaStreamCreate(&updateStream_);
+ CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu");
#endif
- if (deviceContext != nullptr)
- {
- deviceContext_ = *static_cast<const DeviceContext*>(deviceContext);
+ GMX_UNUSED_VALUE(deviceContext);
}
+ // Map the atom locality to the stream that will be used for coordinates transfer.
+ // Same streams are used for H2D and D2H copies
+ xCopyStreams_[AtomLocality::Local] = updateStream_;
+ xCopyStreams_[AtomLocality::NonLocal] = nonLocalStream_;
+ xCopyStreams_[AtomLocality::All] = updateStream_;
}
StatePropagatorDataGpu::Impl::~Impl()
void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
{
-#if GMX_GPU == GMX_GPU_OPENCL
- GMX_ASSERT(deviceContext_ != nullptr, "GPU context should be set in OpenCL builds.");
-#endif
numAtomsLocal_ = numAtomsLocal;
numAtomsAll_ = numAtomsAll;
const size_t paddingAllocationSize = numAtomsPadded - numAtomsAll_;
if (paddingAllocationSize > 0)
{
- // The PME stream is used here because:
- // 1. The padding clearing is only needed by PME.
- // 2. It is the stream that is created in the PME OpenCL context.
+ // The PME stream is used here because the padding region of d_x_ is only in the PME task.
clearDeviceBufferAsync(&d_x_, DIM*numAtomsAll_, DIM*paddingAllocationSize, pmeStream_);
}
CommandStream commandStream)
{
-#if GMX_GPU == GMX_GPU_OPENCL
- GMX_ASSERT(deviceContext_ != nullptr, "GPU context should be set in OpenCL builds.");
- // The PME stream is used for OpenCL builds, because it is the context that it associated with the
- // PME task which requires the coordinates managed here in OpenCL.
- // TODO: This will have to be changed when the OpenCL implementation will be extended.
- commandStream = pmeStream_;
-#endif
-
GMX_UNUSED_VALUE(dataSize);
GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated.");
CommandStream commandStream)
{
-#if GMX_GPU == GMX_GPU_OPENCL
- GMX_ASSERT(deviceContext_ != nullptr, "GPU context should be set in OpenCL builds.");
- // The PME stream is used for OpenCL builds, because it is the context that it associated with the
- // PME task which requires the coordinates managed here in OpenCL.
- // TODO: This will have to be changed when the OpenCL implementation will be extended.
- commandStream = pmeStream_;
-#endif
-
GMX_UNUSED_VALUE(dataSize);
GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated.");
void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec> h_x,
AtomLocality atomLocality)
{
- // TODO: Use the correct stream
- copyToDevice(d_x_, h_x, d_xSize_, atomLocality, nullptr);
+ GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+ CommandStream commandStream = xCopyStreams_[atomLocality];
+ GMX_ASSERT(commandStream != nullptr, "No stream is valid for copying positions with given atom locality.");
+
+ copyToDevice(d_x_, h_x, d_xSize_, atomLocality, commandStream);
+
+ // markEvent is skipped in OpenCL as:
+ // - it's not needed, copy is done in the same stream as the only consumer task (PME)
+ // - we don't consume the events in OpenCL which is not allowed by GpuEventSynchronizer (would leak memory).
+ // TODO: remove this by adding an event-mark free flavor of this function
+ if (GMX_GPU == GMX_GPU_CUDA)
+ {
+ xReadyOnDevice_[atomLocality].markEvent(commandStream);
+ // TODO: Remove When event-based synchronization is introduced
+ gpuStreamSynchronize(commandStream);
+ }
+}
+
+GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality)
+{
+ return &xReadyOnDevice_[atomLocality];
}
void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x,
AtomLocality atomLocality)
{
- // TODO: Use the correct stream
- copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, nullptr);
+ GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+ CommandStream commandStream = xCopyStreams_[atomLocality];
+ GMX_ASSERT(commandStream != nullptr, "No stream is valid for copying positions with given atom locality.");
+
+ copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, commandStream);
+ // TODO: Remove When event-based synchronization is introduced
+ gpuStreamSynchronize(commandStream);
+ // Note: unlike copyCoordinatesToGpu this is not used in OpenCL, and the conditional is not needed.
+ xReadyOnHost_[atomLocality].markEvent(commandStream);
+}
+
+void StatePropagatorDataGpu::Impl::waitCoordinatesReadyOnHost(AtomLocality atomLocality)
+{
+ xReadyOnHost_[atomLocality].waitForEvent();
}
return impl_->copyCoordinatesToGpu(h_x, atomLocality);
}
+GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality)
+{
+ return impl_->getCoordinatesReadyOnDeviceEvent(atomLocality);
+}
+
void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<RVec> h_x,
AtomLocality atomLocality)
{
return impl_->copyCoordinatesFromGpu(h_x, atomLocality);
}
+void StatePropagatorDataGpu::waitCoordinatesReadyOnHost(AtomLocality atomLocality)
+{
+ return impl_->waitCoordinatesReadyOnHost(atomLocality);
+}
+
DeviceBuffer<float> StatePropagatorDataGpu::getVelocities()
{