/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#ifndef GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_H
#define GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_H
+#include <memory>
+#include <tuple>
+
#include "gromacs/gpu_utils/devicebuffer_datatype.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/math/vectypes.h"
#include "locality.h"
+class DeviceContext;
+class DeviceStream;
class GpuEventSynchronizer;
struct gmx_wallcycle;
namespace gmx
{
+class DeviceStreamManager;
class StatePropagatorDataGpu
{
* ops are offloaded. This feature is currently not available in OpenCL and
* hence these streams are not set in these builds.
*
- * \note In CUDA, the update stream is created in the constructor as a temporary
- * solution, in place until the stream manager is introduced.
- * Note that this makes it impossible to construct this object in CUDA
- * builds executing on a host without any CUDA-capable device available.
- *
- * \note In CUDA, \p deviceContext is unused, hence always nullptr;
- * all stream arguments can also be nullptr in runs where the
- * respective streams are not required.
- * In OpenCL, \p deviceContext needs to be a valid device context.
- * In OpenCL runs StatePropagatorDataGpu is currently only used
- * with PME offload, and only on ranks with PME duty. Hence, the
- * \p pmeStream argument needs to be a valid OpenCL queue object
- * which must have been created in \p deviceContext.
- *
- * \todo Make a \p CommandStream visible in the CPU parts of the code so we
- * will not have to pass a void*.
- * \todo Make a \p DeviceContext object visible in CPU parts of the code so we
- * will not have to pass a void*.
- *
- * \param[in] pmeStream Device PME stream, nullptr allowed.
- * \param[in] localStream Device NBNXM local stream, nullptr allowed.
- * \param[in] nonLocalStream Device NBNXM non-local stream, nullptr allowed.
- * \param[in] deviceContext Device context, nullptr allowed.
- * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
- * \param[in] paddingSize Padding size for coordinates buffer.
- * \param[in] wcycle Wall cycle counter data.
+ * \param[in] deviceStreamManager Object that owns the DeviceContext and DeviceStreams.
+ * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
+ * \param[in] allocationBlockSizeDivisor Determines padding size for coordinates buffer.
+ * \param[in] wcycle Wall cycle counter data.
*/
- StatePropagatorDataGpu(const void* pmeStream,
- const void* localStream,
- const void* nonLocalStream,
- const void* deviceContext,
- GpuApiCallBehavior transferKind,
- int paddingSize,
- gmx_wallcycle* wcycle);
+ StatePropagatorDataGpu(const DeviceStreamManager& deviceStreamManager,
+ GpuApiCallBehavior transferKind,
+ int allocationBlockSizeDivisor,
+ gmx_wallcycle* wcycle);
/*! \brief Constructor to use in PME-only rank and in tests.
*
* \param[in] pmeStream Device PME stream, nullptr is not allowed.
* \param[in] deviceContext Device context, nullptr allowed for non-OpenCL builds.
* \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
- * \param[in] paddingSize Padding size for coordinates buffer.
+ * \param[in] allocationBlockSizeDivisor Determines padding size for coordinates buffer.
* \param[in] wcycle Wall cycle counter data.
*/
- StatePropagatorDataGpu(const void* pmeStream,
- const void* deviceContext,
- GpuApiCallBehavior transferKind,
- int paddingSize,
- gmx_wallcycle* wcycle);
+ StatePropagatorDataGpu(const DeviceStream* pmeStream,
+ const DeviceContext& deviceContext,
+ GpuApiCallBehavior transferKind,
+ int allocationBlockSizeDivisor,
+ gmx_wallcycle* wcycle);
//! Move constructor
StatePropagatorDataGpu(StatePropagatorDataGpu&& other) noexcept;
*
* \returns Tuple, containing the index of the first atom in the range and the total number of atoms in the range.
*/
- std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality atomLocality);
+ std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality atomLocality) const;
/*! \brief Get the positions buffer on the GPU.
*
* \returns GPU positions buffer.
*/
- DeviceBuffer<float> getCoordinates();
+ DeviceBuffer<RVec> getCoordinates();
/*! \brief Copy positions to the GPU memory.
+ *
+ * Use \ref getCoordinatesReadyOnDeviceEvent to get the associated event synchronizer or
+ * \ref waitCoordinatesCopiedToDevice to wait for the copy completion.
+ * Note: the event is not marked in OpenCL, because it is not used.
*
* \param[in] h_x Positions in the host memory.
* \param[in] atomLocality Locality of the particles to copy.
* steps and if update is not offloaded, the coordinates are provided by the H2D copy and the
* returned synchronizer indicates that the copy is complete.
*
- * \param[in] atomLocality Locality of the particles to wait for.
- * \param[in] simulationWork The simulation lifetime flags.
- * \param[in] stepWork The step lifetime flags.
+ * \param[in] atomLocality Locality of the particles to wait for.
+ * \param[in] simulationWork The simulation lifetime flags.
+ * \param[in] stepWork The step lifetime flags.
+ * \param[in] gpuCoordinateHaloLaunched Event recorded when GPU coordinate halo has been launched.
*
* \returns The event to synchronize the stream that consumes coordinates on device.
*/
GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality,
const SimulationWorkload& simulationWork,
- const StepWorkload& stepWork);
+ const StepWorkload& stepWork,
+ GpuEventSynchronizer* gpuCoordinateHaloLaunched = nullptr);
/*! \brief Blocking wait until coordinates are copied to the device.
*
*/
void waitCoordinatesCopiedToDevice(AtomLocality atomLocality);
- /*! \brief Getter for the event synchronizer for the update is done on th GPU
+ /*! \brief Consume the event for copying coordinates to the device.
+ *
+ * Used for manual event consumption. Does nothing except changing the internal event counter.
+ *
+ * \param[in] atomLocality Locality of the particles.
+ */
+ void consumeCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality);
+
+ /*! \brief Reset the event for copying coordinates to the device.
*
- * \returns The event to synchronize the stream coordinates wre updated on device.
+ * Used for manual event consumption. Does nothing except resetting the event.
+ *
+ * \param[in] atomLocality Locality of the particles.
+ */
+ void resetCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality);
+
+ /*! \brief Setter for the event synchronizer that marks when the update is done on the GPU
+ *
+ * \param[in] xUpdatedOnDeviceEvent The event to synchronize the stream once coordinates were updated on device.
*/
- GpuEventSynchronizer* xUpdatedOnDevice();
+ void setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpdatedOnDeviceEvent);
- /*! \brief Copy positions from the GPU memory.
+ /*! \brief Copy positions from the GPU memory, with an optional explicit dependency.
*
* \param[in] h_x Positions buffer in the host memory.
* \param[in] atomLocality Locality of the particles to copy.
+ * \param[in] dependency Dependency event for this operation.
*/
- void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x, AtomLocality atomLocality);
+ void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x,
+ AtomLocality atomLocality,
+ GpuEventSynchronizer* dependency = nullptr);
/*! \brief Wait until coordinates are available on the host.
*
*
* \returns GPU velocities buffer.
*/
- DeviceBuffer<float> getVelocities();
+ DeviceBuffer<RVec> getVelocities();
/*! \brief Copy velocities to the GPU memory.
+ *
+ * Does not mark any event, because we don't use it anywhere at the moment.
*
* \param[in] h_v Velocities in the host memory.
* \param[in] atomLocality Locality of the particles to copy.
*/
void copyVelocitiesToGpu(gmx::ArrayRef<const gmx::RVec> h_v, AtomLocality atomLocality);
- /*! \brief Get the event synchronizer for the H2D velocities copy.
- *
- * \param[in] atomLocality Locality of the particles to wait for.
- *
- * \returns The event to synchronize the stream that consumes velocities on device.
- */
- GpuEventSynchronizer* getVelocitiesReadyOnDeviceEvent(AtomLocality atomLocality);
-
/*! \brief Copy velocities from the GPU memory.
*
* \param[in] h_v Velocities buffer in the host memory.
*
* \returns GPU force buffer.
*/
- DeviceBuffer<float> getForces();
+ DeviceBuffer<RVec> getForces();
/*! \brief Copy forces to the GPU memory.
*
*/
void copyForcesToGpu(gmx::ArrayRef<const gmx::RVec> h_f, AtomLocality atomLocality);
+ /*! \brief Clear forces in the GPU memory.
+ *
+ * \param[in] atomLocality Locality of the particles to clear.
+ * \param[in] dependency Dependency event for this operation.
+ */
+ void clearForcesOnGpu(AtomLocality atomLocality, GpuEventSynchronizer* dependency);
+
/*! \brief Get the event synchronizer for the forces ready on device.
*
* Returns either of the event synchronizers, depending on the offload scenario
* 1. The forces are copied to the device (when GPU buffer ops are off)
* 2. The forces are reduced on the device (GPU buffer ops are on)
*
- * \todo Pass step workload instead of the useGpuFBufferOps boolean.
- *
- * \param[in] atomLocality Locality of the particles to wait for.
- * \param[in] useGpuFBufferOps If the force buffer ops are offloaded to the GPU.
+ * \param[in] stepWork Step workload flags
+ * \param[in] simulationWork Simulation workload flags
*
* \returns The event to synchronize the stream that consumes forces on device.
*/
- GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality atomLocality, bool useGpuFBufferOps);
+ GpuEventSynchronizer* getLocalForcesReadyOnDeviceEvent(StepWorkload stepWork,
+ SimulationWorkload simulationWork);
/*! \brief Getter for the event synchronizer for the forces are reduced on the GPU.
*
- * \returns The event to mark when forces are reduced on the GPU.
+ * \param[in] atomLocality Locality of the particles to wait for.
+ * \returns The event to mark when forces are reduced on the GPU.
+ */
+ GpuEventSynchronizer* fReducedOnDevice(AtomLocality atomLocality);
+
+ /*! \brief Consume the event for when the forces are reduced on the GPU.
+ *
+ * \param[in] atomLocality Locality of the particles to wait for.
+ */
+ void consumeForcesReducedOnDeviceEvent(AtomLocality atomLocality);
+
+ /*! \brief Getter for the event synchronizer for the forces are ready on the GPU.
+ *
+ * \param[in] atomLocality Locality of the particles to wait for.
+ * \returns The event to mark when forces are ready on the GPU.
*/
- GpuEventSynchronizer* fReducedOnDevice();
+ GpuEventSynchronizer* fReadyOnDevice(AtomLocality atomLocality);
/*! \brief Copy forces from the GPU memory.
*
*
* \returns The device command stream to use in update-constraints.
*/
- void* getUpdateStream();
+ const DeviceStream* getUpdateStream();
/*! \brief Getter for the number of local atoms.
*
* \returns The number of local atoms.
*/
- int numAtomsLocal();
+ int numAtomsLocal() const;
/*! \brief Getter for the total number of atoms.
*
* \returns The total number of atoms.
*/
- int numAtomsAll();
+ int numAtomsAll() const;
private:
class Impl;
- gmx::PrivateImplPointer<Impl> impl_;
+ std::unique_ptr<Impl> impl_;
GMX_DISALLOW_COPY_AND_ASSIGN(StatePropagatorDataGpu);
};