/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include "config.h"
+#include <memory>
+
#include "gromacs/gpu_utils/devicebuffer.h"
-#if GMX_GPU == GMX_GPU_CUDA
-#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
-#elif GMX_GPU == GMX_GPU_OPENCL
-#include "gromacs/gpu_utils/gpueventsynchronizer_ocl.h"
-#endif
+#include "gromacs/gpu_utils/gpueventsynchronizer.h"
#include "gromacs/math/vectypes.h"
#include "gromacs/mdtypes/state_propagator_data_gpu.h"
-#include "gromacs/utility/classhelpers.h"
#include "gromacs/utility/enumerationhelpers.h"
+struct gmx_wallcycle;
+
namespace gmx
{
class StatePropagatorDataGpu::Impl
{
- public:
-
- Impl();
-
-
- /*! \brief Constructor
- *
- * The buffers are reallocated only at the reinit call, the padding is
- * used there for the coordinates buffer. It is needed for PME and added at
- * the end of the buffer. It is assumed that if the rank has PME duties on the
- * GPU, all coordinates are copied to the GPU and hence, for this rank, the
- * coordinates buffer is not split into local and non-local ranges. For other
- * ranks, the padding size is zero. This works because only one rank ever does
- * PME work on the GPU, and if that rank also does PP work that is the only
- * rank. So all coordinates are always transferred.
- *
- * In OpenCL, only pmeStream is used since it is the only stream created in
- * PME context. The local and non-local streams are only needed when buffer
- * ops are offloaded. This feature is currently not available in OpenCL and
- * hence these streams are not set in these builds.
- *
- * \note In CUDA, the update stream is created in the constructor as a temporary
- * solution, in place until the stream manager is introduced.
- * Note that this makes it impossible to construct this object in CUDA
- * builds executing on a host without any CUDA-capable device available.
- *
- * \note In CUDA, \p deviceContext is unused, hence always nullptr;
- * all stream arguments can also be nullptr in runs where the
- * respective streams are not required.
- * In OpenCL, \p deviceContext needs to be a valid device context.
- * In OpenCL runs StatePropagatorDataGpu is currently only used
- * with PME offload, and only on ranks with PME duty. Hence, the
- * \p pmeStream argument needs to be a valid OpenCL queue object
- * which must have been created in \p deviceContext.
- *
- * \todo Make a \p CommandStream visible in the CPU parts of the code so we
- * will not have to pass a void*.
- * \todo Make a \p DeviceContext object visible in CPU parts of the code so we
- * will not have to pass a void*.
- *
- * \param[in] pmeStream Device PME stream, nullptr allowed.
- * \param[in] localStream Device NBNXM local stream, nullptr allowed.
- * \param[in] nonLocalStream Device NBNXM non-local stream, nullptr allowed.
- * \param[in] deviceContext Device context, nullptr allowed.
- * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
- * \param[in] paddingSize Padding size for coordinates buffer.
- */
- Impl(const void *pmeStream,
- const void *localStream,
- const void *nonLocalStream,
- const void *deviceContext,
- GpuApiCallBehavior transferKind,
- int paddingSize);
-
- ~Impl();
-
-
- /*! \brief Set the ranges for local and non-local atoms and reallocates buffers.
- *
- * \note
- * The coordinates buffer is (re)allocated, when required by PME, with a padding,
- * the size of which is set by the constructor. The padding region clearing kernel
- * is scheduled in the \p pmeStream_ (unlike the coordinates H2D) as only the PME
- * task uses this padding area.
- *
- * \param[in] numAtomsLocal Number of atoms in local domain.
- * \param[in] numAtomsAll Total number of atoms to handle.
- */
- void reinit(int numAtomsLocal, int numAtomsAll);
-
- /*! \brief Returns the range of atoms to be copied based on the copy type (all, local or non-local).
- *
- * \todo There are at least three versions of the function with this functionality in the code:
- * this one and two more in NBNXM. These should be unified in a shape of a general function
- * in DD.
- *
- * \param[in] atomLocality If all, local or non-local ranges are needed.
- *
- * \returns Tuple, containing the index of the first atom in the range and the total number of atoms in the range.
- */
- std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality atomLocality);
-
-
- /*! \brief Get the positions buffer on the GPU.
- *
- * \returns GPU positions buffer.
- */
- DeviceBuffer<float> getCoordinates();
-
- /*! \brief Copy positions to the GPU memory.
- *
- * \param[in] h_x Positions in the host memory.
- * \param[in] atomLocality Locality of the particles to copy.
- */
- void copyCoordinatesToGpu(gmx::ArrayRef<const gmx::RVec> h_x,
- AtomLocality atomLocality);
-
- /*! \brief Get the event synchronizer of the coordinates ready for the consumption on the device.
- *
- * Returns the event synchronizer which indicates that the coordinates are ready for the
- * consumption on the device. Takes into account that the producer may be different.
- *
- * If the update is offloaded, and the current step is not a DD/search step, the returned
- * synchronizer indicates the completion of GPU update-constraint kernels. Otherwise, on search
- * steps and if update is not offloaded, the coordinates are provided by the H2D copy and the
- * returned synchronizer indicates that the copy is complete.
- *
- * \param[in] atomLocality Locality of the particles to wait for.
- * \param[in] simulationWork The simulation lifetime flags.
- * \param[in] stepWork The step lifetime flags.
- *
- * \returns The event to synchronize the stream that consumes coordinates on device.
- */
- GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality,
- const SimulationWorkload &simulationWork,
- const StepWorkload &stepWork);
-
- /*! \brief Getter for the event synchronizer for the update is done on th GPU
- *
- * \returns The event to synchronize the stream coordinates wre updated on device.
- */
- GpuEventSynchronizer* xUpdatedOnDevice();
-
- /*! \brief Copy positions from the GPU memory.
- *
- * \param[in] h_x Positions buffer in the host memory.
- * \param[in] atomLocality Locality of the particles to copy.
- */
- void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x,
- AtomLocality atomLocality);
-
- /*! \brief Wait until coordinates are available on the host.
- *
- * \param[in] atomLocality Locality of the particles to wait for.
- */
- void waitCoordinatesReadyOnHost(AtomLocality atomLocality);
-
-
- /*! \brief Get the velocities buffer on the GPU.
- *
- * \returns GPU velocities buffer.
- */
- DeviceBuffer<float> getVelocities();
-
- /*! \brief Copy velocities to the GPU memory.
- *
- * \param[in] h_v Velocities in the host memory.
- * \param[in] atomLocality Locality of the particles to copy.
- */
- void copyVelocitiesToGpu(gmx::ArrayRef<const gmx::RVec> h_v,
- AtomLocality atomLocality);
-
- /*! \brief Get the event synchronizer on the H2D velocities copy.
- *
- * \param[in] atomLocality Locality of the particles to wait for.
- *
- * \returns The event to synchronize the stream that consumes velocities on device.
- */
- GpuEventSynchronizer* getVelocitiesReadyOnDeviceEvent(AtomLocality atomLocality);
-
- /*! \brief Copy velocities from the GPU memory.
- *
- * \param[in] h_v Velocities buffer in the host memory.
- * \param[in] atomLocality Locality of the particles to copy.
- */
- void copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec> h_v,
- AtomLocality atomLocality);
-
- /*! \brief Wait until velocities are available on the host.
- *
- * \param[in] atomLocality Locality of the particles to wait for.
- */
- void waitVelocitiesReadyOnHost(AtomLocality atomLocality);
-
-
- /*! \brief Get the force buffer on the GPU.
- *
- * \returns GPU force buffer.
- */
- DeviceBuffer<float> getForces();
-
- /*! \brief Copy forces to the GPU memory.
- *
- * \param[in] h_f Forces in the host memory.
- * \param[in] atomLocality Locality of the particles to copy.
- */
- void copyForcesToGpu(gmx::ArrayRef<const gmx::RVec> h_f,
- AtomLocality atomLocality);
-
- /*! \brief Get the event synchronizer on the H2D forces copy.
- *
- * \param[in] atomLocality Locality of the particles to wait for.
- *
- * \returns The event to synchronize the stream that consumes forces on device.
- */
- GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality atomLocality);
-
- /*! \brief Copy forces from the GPU memory.
- *
- * \param[in] h_f Forces buffer in the host memory.
- * \param[in] atomLocality Locality of the particles to copy.
- */
- void copyForcesFromGpu(gmx::ArrayRef<gmx::RVec> h_f,
- AtomLocality atomLocality);
-
- /*! \brief Wait until forces are available on the host.
- *
- * \param[in] atomLocality Locality of the particles to wait for.
- */
- void waitForcesReadyOnHost(AtomLocality atomLocality);
-
- /*! \brief Getter for the update stream.
- *
- * \todo This is temporary here, until the management of this stream is taken over.
- *
- * \returns The device command stream to use in update-constraints.
- */
- void* getUpdateStream();
-
- /*! \brief Getter for the number of local atoms.
- *
- * \returns The number of local atoms.
- */
- int numAtomsLocal();
-
- /*! \brief Getter for the total number of atoms.
- *
- * \returns The total number of atoms.
- */
- int numAtomsAll();
-
- private:
-
- //! GPU PME stream.
- CommandStream pmeStream_ = nullptr;
- //! GPU NBNXM local stream.
- CommandStream localStream_ = nullptr;
- //! GPU NBNXM non-local stream
- CommandStream nonLocalStream_ = nullptr;
- //! GPU Update-constreaints stream.
- CommandStream updateStream_ = nullptr;
-
- // Streams to use for coordinates H2D and D2H copies (one event for each atom locality)
- EnumerationArray<AtomLocality, CommandStream> xCopyStreams_ = {{nullptr}};
- // Streams to use for velocities H2D and D2H copies (one event for each atom locality)
- EnumerationArray<AtomLocality, CommandStream> vCopyStreams_ = {{nullptr}};
- // Streams to use for forces H2D and D2H copies (one event for each atom locality)
- EnumerationArray<AtomLocality, CommandStream> fCopyStreams_ = {{nullptr}};
-
- /*! \brief An array of events that indicate H2D copy is complete (one event for each atom locality)
- *
- * \todo Reconsider naming. It should be xCopiedToDevice or xH2DCopyComplete, etc.
- */
- EnumerationArray<AtomLocality, GpuEventSynchronizer> xReadyOnDevice_;
- //! An event that the coordinates are ready after update-constraints execution
- GpuEventSynchronizer xUpdatedOnDevice_;
- //! An array of events that indicate D2H copy of coordinates is complete (one event for each atom locality)
- EnumerationArray<AtomLocality, GpuEventSynchronizer> xReadyOnHost_;
-
- //! An array of events that indicate H2D copy of velocities is complete (one event for each atom locality)
- EnumerationArray<AtomLocality, GpuEventSynchronizer> vReadyOnDevice_;
- //! An array of events that indicate D2H copy of velocities is complete (one event for each atom locality)
- EnumerationArray<AtomLocality, GpuEventSynchronizer> vReadyOnHost_;
-
- //! An array of events that indicate H2D copy of forces is complete (one event for each atom locality)
- EnumerationArray<AtomLocality, GpuEventSynchronizer> fReadyOnDevice_;
- //! An array of events that indicate D2H copy of forces is complete (one event for each atom locality)
- EnumerationArray<AtomLocality, GpuEventSynchronizer> fReadyOnHost_;
-
- /*! \brief GPU context (for OpenCL builds)
- * \todo Make a Context class usable in CPU code
- */
- DeviceContext deviceContext_ = nullptr;
- //! Default GPU calls behavior
- GpuApiCallBehavior transferKind_ = GpuApiCallBehavior::Async;
- //! Padding size for the coordinates buffer
- int paddingSize_ = 0;
-
- //! Number of local atoms
- int numAtomsLocal_ = -1;
- //! Total number of atoms
- int numAtomsAll_ = -1;
-
- //! Device positions buffer
- DeviceBuffer<float> d_x_;
- //! Number of particles saved in the positions buffer
- int d_xSize_ = -1;
- //! Allocation size for the positions buffer
- int d_xCapacity_ = -1;
-
- //! Device velocities buffer
- DeviceBuffer<float> d_v_;
- //! Number of particles saved in the velocities buffer
- int d_vSize_ = -1;
- //! Allocation size for the velocities buffer
- int d_vCapacity_ = -1;
-
- //! Device force buffer
- DeviceBuffer<float> d_f_;
- //! Number of particles saved in the force buffer
- int d_fSize_ = -1;
- //! Allocation size for the force buffer
- int d_fCapacity_ = -1;
-
- /*! \brief Performs the copy of data from host to device buffer.
- *
- * \todo Template on locality.
- *
- * \param[out] d_data Device-side buffer.
- * \param[in] h_data Host-side buffer.
- * \param[in] dataSize Device-side data allocation size.
- * \param[in] atomLocality If all, local or non-local ranges should be copied.
- * \param[in] commandStream GPU stream to execute copy in.
- */
- void copyToDevice(DeviceBuffer<float> d_data,
- gmx::ArrayRef<const gmx::RVec> h_data,
- int dataSize,
- AtomLocality atomLocality,
- CommandStream commandStream);
-
- /*! \brief Performs the copy of data from device to host buffer.
- *
- * \param[out] h_data Host-side buffer.
- * \param[in] d_data Device-side buffer.
- * \param[in] dataSize Device-side data allocation size.
- * \param[in] atomLocality If all, local or non-local ranges should be copied.
- * \param[in] commandStream GPU stream to execute copy in.
- */
- void copyFromDevice(gmx::ArrayRef<gmx::RVec> h_data,
- DeviceBuffer<float> d_data,
- int dataSize,
- AtomLocality atomLocality,
- CommandStream commandStream);
+public:
+ Impl();
+
+
+ /*! \brief Constructor
+ *
+     * The buffers are reallocated only at the reinit call; the padding is
+ * used there for the coordinates buffer. It is needed for PME and added at
+ * the end of the buffer. It is assumed that if the rank has PME duties on the
+ * GPU, all coordinates are copied to the GPU and hence, for this rank, the
+ * coordinates buffer is not split into local and non-local ranges. For other
+ * ranks, the padding size is zero. This works because only one rank ever does
+ * PME work on the GPU, and if that rank also does PP work that is the only
+ * rank. So all coordinates are always transferred.
+ *
+     * In OpenCL, only the PME stream is used since it is the only stream created in
+     * the PME context. The local and non-local streams are only needed when buffer
+ * ops are offloaded. This feature is currently not available in OpenCL and
+ * hence these streams are not set in these builds.
+ *
+ * \param[in] deviceStreamManager Object that owns the DeviceContext and DeviceStreams.
+ * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
+ * \param[in] allocationBlockSizeDivisor Determines the padding size for coordinates buffer.
+ * \param[in] wcycle Wall cycle counter data.
+ */
+ Impl(const DeviceStreamManager& deviceStreamManager,
+ GpuApiCallBehavior transferKind,
+ int allocationBlockSizeDivisor,
+ gmx_wallcycle* wcycle);
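+
+    /* A minimal construction sketch for the stream-manager path; the caller-side
+     * names (deviceStreamManager, blockSizeDivisor, wcycle) are illustrative
+     * placeholders, not part of this API:
+     *
+     *   StatePropagatorDataGpu::Impl stateGpu(
+     *           deviceStreamManager, GpuApiCallBehavior::Async, blockSizeDivisor, wcycle);
+     */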
+
+    /*! \brief Constructor to use on a PME-only rank and in tests.
+     *
+     *  This constructor should be used if only a coordinate device buffer should be managed
+     *  using a single stream. Any operation on the force or velocity buffers, as well as any
+     *  copy of non-local coordinates, will exit with an assertion failure. Note that
+     *  \p pmeStream can not be a nullptr; if it is, the constructor will exit with an
+     *  assertion failure.
+ *
+     * \todo Currently, unsupported copy operations are blocked by an assertion that the stream
+     *       is not nullptr. This should be improved.
+ *
+ * \param[in] pmeStream Device PME stream, nullptr is not allowed.
+ * \param[in] deviceContext Device context, nullptr allowed for non-OpenCL builds.
+ * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
+ * \param[in] allocationBlockSizeDivisor Determines the padding size for coordinates buffer.
+ * \param[in] wcycle Wall cycle counter data.
+ */
+ Impl(const DeviceStream* pmeStream,
+ const DeviceContext& deviceContext,
+ GpuApiCallBehavior transferKind,
+ int allocationBlockSizeDivisor,
+ gmx_wallcycle* wcycle);
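+
+    /* A sketch of the PME-only construction path, e.g. in tests. The names
+     * pmeStream, deviceContext and wcycle are illustrative placeholders; the
+     * stream must be a valid DeviceStream created in deviceContext:
+     *
+     *   StatePropagatorDataGpu::Impl stateGpu(
+     *           &pmeStream, deviceContext, GpuApiCallBehavior::Sync, 1, wcycle);
+     */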
+
+ ~Impl();
+
+
+ /*! \brief Set the ranges for local and non-local atoms and reallocates buffers.
+ *
+     * Reallocates the coordinates, velocities, and force buffers on the device.
+ *
+ * \note
+ * The coordinates buffer is (re)allocated, when required by PME, with a padding,
+ * the size of which is set by the constructor. The padding region clearing kernel
+ * is scheduled in the \p pmeStream_ (unlike the coordinates H2D) as only the PME
+ * task uses this padding area.
+ *
+ * \note
+ * The force buffer is cleared if its size increases, so that previously unused
+ * memory is cleared before forces are accumulated.
+ *
+ * \param[in] numAtomsLocal Number of atoms in local domain.
+ * \param[in] numAtomsAll Total number of atoms to handle.
+ */
+ void reinit(int numAtomsLocal, int numAtomsAll);
+
+ /*! \brief Returns the range of atoms to be copied based on the copy type (all, local or non-local).
+ *
+ * \todo There are at least three versions of the function with this functionality in the code:
+ * this one and two more in NBNXM. These should be unified in a shape of a general function
+ * in DD.
+ *
+ * \param[in] atomLocality If all, local or non-local ranges are needed.
+ *
+ * \returns Tuple, containing the index of the first atom in the range and the total number of atoms in the range.
+ */
+ std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality atomLocality) const;
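+
+    /* Illustrative unpacking of the returned range, assuming the AtomLocality::Local
+     * enumeration value used throughout this module:
+     *
+     *   auto [atomsStartAt, numAtomsToCopy] =
+     *           getAtomRangesFromAtomLocality(AtomLocality::Local);
+     */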
+
+
+ /*! \brief Get the positions buffer on the GPU.
+ *
+ * \returns GPU positions buffer.
+ */
+ DeviceBuffer<RVec> getCoordinates();
+
+ /*! \brief Copy positions to the GPU memory.
+ *
+ * \param[in] h_x Positions in the host memory.
+ * \param[in] atomLocality Locality of the particles to copy.
+ */
+ void copyCoordinatesToGpu(gmx::ArrayRef<const gmx::RVec> h_x, AtomLocality atomLocality);
+
+ /*! \brief Get the event synchronizer of the coordinates ready for the consumption on the device.
+ *
+ * Returns the event synchronizer which indicates that the coordinates are ready for the
+ * consumption on the device. Takes into account that the producer may be different.
+ *
+ * If the update is offloaded, and the current step is not a DD/search step, the returned
+ * synchronizer indicates the completion of GPU update-constraint kernels. Otherwise, on search
+ * steps and if update is not offloaded, the coordinates are provided by the H2D copy and the
+ * returned synchronizer indicates that the copy is complete.
+ *
+ * \param[in] atomLocality Locality of the particles to wait for.
+ * \param[in] simulationWork The simulation lifetime flags.
+ * \param[in] stepWork The step lifetime flags.
+ * \param[in] gpuCoordinateHaloLaunched Event recorded when GPU coordinate halo has been launched.
+ *
+ * \returns The event to synchronize the stream that consumes coordinates on device.
+ */
+ GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality,
+ const SimulationWorkload& simulationWork,
+ const StepWorkload& stepWork,
+ GpuEventSynchronizer* gpuCoordinateHaloLaunched = nullptr);
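+
+    /* A sketch of the intended producer/consumer pattern; consumerStream,
+     * simulationWork and stepWork are assumed to exist on the caller side:
+     *
+     *   copyCoordinatesToGpu(h_x, AtomLocality::Local);
+     *   GpuEventSynchronizer* xReady = getCoordinatesReadyOnDeviceEvent(
+     *           AtomLocality::Local, simulationWork, stepWork);
+     *   xReady->enqueueWaitEvent(consumerStream);
+     */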
+
+ /*! \brief Blocking wait until coordinates are copied to the device.
+ *
+ * Synchronizes the stream in which the copy was executed.
+ *
+ * \param[in] atomLocality Locality of the particles to wait for.
+ */
+ void waitCoordinatesCopiedToDevice(AtomLocality atomLocality);
+
+ /*! \brief Consume the event for copying coordinates to the device.
+ *
+ * Used for manual event consumption. Does nothing except changing the internal event counter.
+ *
+ * \param[in] atomLocality Locality of the particles.
+ */
+ void consumeCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality);
+
+ /*! \brief Reset the event for copying coordinates to the device.
+ *
+ * Used for manual event consumption. Does nothing except resetting the event.
+ *
+ * \param[in] atomLocality Locality of the particles.
+ */
+ void resetCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality);
+
+    /*! \brief Setter for the event synchronizer marking that the update is done on the GPU.
+     *
+     * \param[in] xUpdatedOnDeviceEvent The event to synchronize the stream in which coordinates were updated on the device.
+     */
+ void setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpdatedOnDeviceEvent);
+
+ /*! \brief Copy positions from the GPU memory, with an optional explicit dependency.
+ *
+ * \param[in] h_x Positions buffer in the host memory.
+ * \param[in] atomLocality Locality of the particles to copy.
+ * \param[in] dependency Dependency event for this operation.
+ */
+ void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x,
+ AtomLocality atomLocality,
+ GpuEventSynchronizer* dependency = nullptr);
+
+ /*! \brief Wait until coordinates are available on the host.
+ *
+ * \param[in] atomLocality Locality of the particles to wait for.
+ */
+ void waitCoordinatesReadyOnHost(AtomLocality atomLocality);
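+
+    /* The matching D2H sketch: issue the copy, then block until the host buffer
+     * is valid (h_x is an illustrative host-side buffer):
+     *
+     *   copyCoordinatesFromGpu(h_x, AtomLocality::Local);
+     *   waitCoordinatesReadyOnHost(AtomLocality::Local);
+     */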
+
+
+ /*! \brief Get the velocities buffer on the GPU.
+ *
+ * \returns GPU velocities buffer.
+ */
+ DeviceBuffer<RVec> getVelocities();
+
+ /*! \brief Copy velocities to the GPU memory.
+ *
+ * \param[in] h_v Velocities in the host memory.
+ * \param[in] atomLocality Locality of the particles to copy.
+ */
+ void copyVelocitiesToGpu(gmx::ArrayRef<const gmx::RVec> h_v, AtomLocality atomLocality);
+
+ /*! \brief Copy velocities from the GPU memory.
+ *
+ * \param[in] h_v Velocities buffer in the host memory.
+ * \param[in] atomLocality Locality of the particles to copy.
+ */
+ void copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec> h_v, AtomLocality atomLocality);
+
+ /*! \brief Wait until velocities are available on the host.
+ *
+ * \param[in] atomLocality Locality of the particles to wait for.
+ */
+ void waitVelocitiesReadyOnHost(AtomLocality atomLocality);
+
+
+ /*! \brief Get the force buffer on the GPU.
+ *
+ * \returns GPU force buffer.
+ */
+ DeviceBuffer<RVec> getForces();
+
+ /*! \brief Copy forces to the GPU memory.
+ *
+ * \param[in] h_f Forces in the host memory.
+ * \param[in] atomLocality Locality of the particles to copy.
+ */
+ void copyForcesToGpu(gmx::ArrayRef<const gmx::RVec> h_f, AtomLocality atomLocality);
+
+ /*! \brief Clear forces in the GPU memory.
+ *
+ * \param[in] atomLocality Locality of the particles to clear.
+ * \param[in] dependency Dependency event for this operation.
+ */
+ void clearForcesOnGpu(AtomLocality atomLocality, GpuEventSynchronizer* dependency);
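+
+    /* A sketch of clearing the local force range with an explicit dependency;
+     * the dependency event is an illustrative placeholder obtained by the caller:
+     *
+     *   clearForcesOnGpu(AtomLocality::Local, dependency);
+     */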
+
+ /*! \brief Get the event synchronizer for the forces ready on device.
+ *
+     *  Returns one of two event synchronizers, depending on the offload scenario
+ * for the current simulation timestep:
+ * 1. The forces are copied to the device (when GPU buffer ops are off)
+ * 2. The forces are reduced on the device (GPU buffer ops are on)
+ *
+ * \param[in] stepWork Step workload flags
+ * \param[in] simulationWork Simulation workload flags
+ *
+ * \returns The event to synchronize the stream that consumes forces on device.
+ */
+ GpuEventSynchronizer* getLocalForcesReadyOnDeviceEvent(StepWorkload stepWork,
+ SimulationWorkload simulationWork);
+
+ /*! \brief Getter for the event synchronizer for when forces are reduced on the GPU.
+ *
+ * \param[in] atomLocality Locality of the particles to wait for.
+ * \returns The event to mark when forces are reduced on the GPU.
+ */
+ GpuEventSynchronizer* fReducedOnDevice(AtomLocality atomLocality);
+
+ //! \brief Consume the event for when the forces are reduced on device.
+ void consumeForcesReducedOnDeviceEvent(AtomLocality atomLocality);
+
+    /*! \brief Getter for the event synchronizer indicating that the forces are ready for GPU update.
+ *
+ * \param[in] atomLocality Locality of the particles to wait for.
+ * \returns The event to mark when forces are ready for GPU update.
+ */
+ GpuEventSynchronizer* fReadyOnDevice(AtomLocality atomLocality);
+
+ /*! \brief Copy forces from the GPU memory.
+ *
+ * \param[in] h_f Forces buffer in the host memory.
+ * \param[in] atomLocality Locality of the particles to copy.
+ */
+ void copyForcesFromGpu(gmx::ArrayRef<gmx::RVec> h_f, AtomLocality atomLocality);
+
+ /*! \brief Wait until forces are available on the host.
+ *
+ * \param[in] atomLocality Locality of the particles to wait for.
+ */
+ void waitForcesReadyOnHost(AtomLocality atomLocality);
+
+ /*! \brief Getter for the update stream.
+ *
+ * \todo This is temporary here, until the management of this stream is taken over.
+ *
+ * \returns The device command stream to use in update-constraints.
+ */
+ const DeviceStream* getUpdateStream();
+
+ /*! \brief Getter for the number of local atoms.
+ *
+ * \returns The number of local atoms.
+ */
+ int numAtomsLocal() const;
+
+ /*! \brief Getter for the total number of atoms.
+ *
+ * \returns The total number of atoms.
+ */
+ int numAtomsAll() const;
+
+private:
+ //! GPU PME stream.
+ const DeviceStream* pmeStream_;
+ //! GPU NBNXM local stream.
+ const DeviceStream* localStream_;
+ //! GPU NBNXM non-local stream.
+ const DeviceStream* nonLocalStream_;
+ //! GPU Update-constraints stream.
+ const DeviceStream* updateStream_;
+
+    //! Streams to use for coordinates H2D and D2H copies (one stream for each atom locality)
+    EnumerationArray<AtomLocality, const DeviceStream*> xCopyStreams_ = { { nullptr } };
+    //! Streams to use for velocities H2D and D2H copies (one stream for each atom locality)
+    EnumerationArray<AtomLocality, const DeviceStream*> vCopyStreams_ = { { nullptr } };
+    //! Streams to use for forces H2D and D2H copies (one stream for each atom locality)
+    EnumerationArray<AtomLocality, const DeviceStream*> fCopyStreams_ = { { nullptr } };
+ // Streams internal to this module
+ std::unique_ptr<DeviceStream> copyInStream_;
+ std::unique_ptr<DeviceStream> memsetStream_;
+
+ /*! \brief An array of events that indicate H2D copy is complete (one event for each atom locality)
+ *
+ * \todo Reconsider naming. It should be xCopiedToDevice or xH2DCopyComplete, etc.
+ */
+ EnumerationArray<AtomLocality, GpuEventSynchronizer> xReadyOnDevice_;
+    //! A pointer to an event that indicates that the coordinates are ready after update-constraints execution
+ GpuEventSynchronizer* xUpdatedOnDeviceEvent_ = nullptr;
+ //! An array of events that indicate D2H copy of coordinates is complete (one event for each atom locality)
+ EnumerationArray<AtomLocality, GpuEventSynchronizer> xReadyOnHost_;
+
+ //! An array of events that indicate D2H copy of velocities is complete (one event for each atom locality)
+ EnumerationArray<AtomLocality, GpuEventSynchronizer> vReadyOnHost_;
+
+ //! An array of events that indicate H2D copy of forces is complete (one event for each atom locality)
+ EnumerationArray<AtomLocality, GpuEventSynchronizer> fReadyOnDevice_;
+ //! An array of events that indicate the forces were reduced on the GPU (one event for each atom locality)
+ EnumerationArray<AtomLocality, GpuEventSynchronizer> fReducedOnDevice_;
+ //! An array of events that indicate D2H copy of forces is complete (one event for each atom locality)
+ EnumerationArray<AtomLocality, GpuEventSynchronizer> fReadyOnHost_;
+
+ //! GPU context (for OpenCL builds)
+ const DeviceContext& deviceContext_;
+ //! Default GPU calls behavior
+ GpuApiCallBehavior transferKind_ = GpuApiCallBehavior::Async;
+ //! Required minimum divisor of the allocation size of the coordinates buffer
+ int allocationBlockSizeDivisor_ = 0;
+
+ //! Number of local atoms
+ int numAtomsLocal_ = -1;
+ //! Total number of atoms
+ int numAtomsAll_ = -1;
+
+ //! Device positions buffer
+ DeviceBuffer<RVec> d_x_;
+ //! Number of particles saved in the positions buffer
+ int d_xSize_ = -1;
+ //! Allocation size for the positions buffer
+ int d_xCapacity_ = -1;
+
+ //! Device velocities buffer
+ DeviceBuffer<RVec> d_v_;
+ //! Number of particles saved in the velocities buffer
+ int d_vSize_ = -1;
+ //! Allocation size for the velocities buffer
+ int d_vCapacity_ = -1;
+
+ //! Device force buffer
+ DeviceBuffer<RVec> d_f_;
+ //! Number of particles saved in the force buffer
+ int d_fSize_ = -1;
+ //! Allocation size for the force buffer
+ int d_fCapacity_ = -1;
+
+ //! \brief Pointer to wallcycle structure.
+ gmx_wallcycle* wcycle_;
+
+ //! Whether this instance of the class is used on a PME-only rank
+ bool isPmeOnly_ = false;
+
+ /*! \brief Performs the copy of data from host to device buffer.
+ *
+ * \todo Template on locality.
+ *
+ * \param[out] d_data Device-side buffer.
+ * \param[in] h_data Host-side buffer.
+ * \param[in] dataSize Device-side data allocation size.
+ * \param[in] atomLocality If all, local or non-local ranges should be copied.
+ * \param[in] deviceStream GPU stream to execute copy in.
+ */
+ void copyToDevice(DeviceBuffer<RVec> d_data,
+ gmx::ArrayRef<const gmx::RVec> h_data,
+ int dataSize,
+ AtomLocality atomLocality,
+ const DeviceStream& deviceStream);
+
+ /*! \brief Performs the copy of data from device to host buffer.
+ *
+ * \param[out] h_data Host-side buffer.
+ * \param[in] d_data Device-side buffer.
+ * \param[in] dataSize Device-side data allocation size.
+ * \param[in] atomLocality If all, local or non-local ranges should be copied.
+ * \param[in] deviceStream GPU stream to execute copy in.
+ */
+ void copyFromDevice(gmx::ArrayRef<gmx::RVec> h_data,
+ DeviceBuffer<RVec> d_data,
+ int dataSize,
+ AtomLocality atomLocality,
+ const DeviceStream& deviceStream);
+
+    /*! \brief Performs the clearing of data in the device buffer.
+ *
+ * \todo Template on locality.
+ *
+ * \param[out] d_data Device-side buffer.
+ * \param[in] dataSize Device-side data allocation size.
+ * \param[in] atomLocality If all, local or non-local ranges should be cleared.
+     * \param[in]  deviceStream         GPU stream to execute the clearing in.
+ */
+ void clearOnDevice(DeviceBuffer<RVec> d_data,
+ int dataSize,
+ AtomLocality atomLocality,
+ const DeviceStream& deviceStream) const;
};
-} // namespace gmx
+} // namespace gmx
#endif // GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_IMPL_H