Balance event consumption for GPU update code path

[alexxy/gromacs.git] / src / gromacs / mdtypes / state_propagator_data_gpu.h
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h

index d75cd78ea1e2b742d559580b7d92ccfe5144a019..4cf4c7998d28a915907fbc4d1f65c596db643d1e 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -50,6 +50,9 @@
  #ifndef GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_H
  #define GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_H
  
+#include <memory>
+#include <tuple>
+
  #include "gromacs/gpu_utils/devicebuffer_datatype.h"
  #include "gromacs/gpu_utils/gpu_utils.h"
  #include "gromacs/math/vectypes.h"
@@ -59,11 +62,14 @@
  
  #include "locality.h"
  
+class DeviceContext;
+class DeviceStream;
  class GpuEventSynchronizer;
  struct gmx_wallcycle;
  
  namespace gmx
  {
+class DeviceStreamManager;
  
  class StatePropagatorDataGpu
  {
@@ -84,40 +90,15 @@ public:
       * ops are offloaded. This feature is currently not available in OpenCL and
       * hence these streams are not set in these builds.
       *
-     * \note In CUDA, the update stream is created in the constructor as a temporary
-     *       solution, in place until the stream manager is introduced.
-     *       Note that this makes it impossible to construct this object in CUDA
-     *       builds executing on a host without any CUDA-capable device available.
-     *
-     * \note In CUDA, \p deviceContext is unused, hence always nullptr;
-     *       all stream arguments can also be nullptr in runs where the
-     *       respective streams are not required.
-     *       In OpenCL, \p deviceContext needs to be a valid device context.
-     *       In OpenCL runs StatePropagatorDataGpu is currently only used
-     *       with PME offload, and only on ranks with PME duty. Hence, the
-     *       \p pmeStream argument needs to be a valid OpenCL queue object
-     *       which must have been created in \p deviceContext.
-     *
-     * \todo Make a \p CommandStream visible in the CPU parts of the code so we
-     *       will not have to pass a void*.
-     * \todo Make a \p DeviceContext object visible in CPU parts of the code so we
-     *       will not have to pass a void*.
-     *
-     *  \param[in] pmeStream       Device PME stream, nullptr allowed.
-     *  \param[in] localStream     Device NBNXM local stream, nullptr allowed.
-     *  \param[in] nonLocalStream  Device NBNXM non-local stream, nullptr allowed.
-     *  \param[in] deviceContext   Device context, nullptr allowed.
-     *  \param[in] transferKind    H2D/D2H transfer call behavior (synchronous or not).
-     *  \param[in] paddingSize     Padding size for coordinates buffer.
-     *  \param[in] wcycle          Wall cycle counter data.
+     *  \param[in] deviceStreamManager         Object that owns the DeviceContext and DeviceStreams.
+     *  \param[in] transferKind                H2D/D2H transfer call behavior (synchronous or not).
+     *  \param[in] allocationBlockSizeDivisor  Determines padding size for coordinates buffer.
+     *  \param[in] wcycle                      Wall cycle counter data.
       */
-    StatePropagatorDataGpu(const void*        pmeStream,
-                           const void*        localStream,
-                           const void*        nonLocalStream,
-                           const void*        deviceContext,
-                           GpuApiCallBehavior transferKind,
-                           int                paddingSize,
-                           gmx_wallcycle*     wcycle);
+    StatePropagatorDataGpu(const DeviceStreamManager& deviceStreamManager,
+                           GpuApiCallBehavior         transferKind,
+                           int                        allocationBlockSizeDivisor,
+                           gmx_wallcycle*             wcycle);
  
      /*! \brief Constructor to use in PME-only rank and in tests.
       *
@@ -132,14 +113,14 @@ public:
       *  \param[in] pmeStream       Device PME stream, nullptr is not allowed.
       *  \param[in] deviceContext   Device context, nullptr allowed for non-OpenCL builds.
       *  \param[in] transferKind    H2D/D2H transfer call behavior (synchronous or not).
-     *  \param[in] paddingSize     Padding size for coordinates buffer.
+     *  \param[in] allocationBlockSizeDivisor Determines padding size for coordinates buffer.
       *  \param[in] wcycle          Wall cycle counter data.
       */
-    StatePropagatorDataGpu(const void*        pmeStream,
-                           const void*        deviceContext,
-                           GpuApiCallBehavior transferKind,
-                           int                paddingSize,
-                           gmx_wallcycle*     wcycle);
+    StatePropagatorDataGpu(const DeviceStream*  pmeStream,
+                           const DeviceContext& deviceContext,
+                           GpuApiCallBehavior   transferKind,
+                           int                  allocationBlockSizeDivisor,
+                           gmx_wallcycle*       wcycle);
  
      //! Move constructor
      StatePropagatorDataGpu(StatePropagatorDataGpu&& other) noexcept;
@@ -177,7 +158,7 @@ public:
       *
       * \returns Tuple, containing the index of the first atom in the range and the total number of atoms in the range.
       */
-    std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality atomLocality);
+    std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality atomLocality) const;
  
  
      /*! \brief Get the positions buffer on the GPU.
@@ -187,6 +168,10 @@ public:
      DeviceBuffer<RVec> getCoordinates();
  
      /*! \brief Copy positions to the GPU memory.
+     *
+     * Use \ref getCoordinatesReadyOnDeviceEvent to get the associated event synchronizer or
+     * \ref waitCoordinatesCopiedToDevice to wait for the copy completion.
+     * Note: the event is not marked in OpenCL, because it is not used.
       *
       *  \param[in] h_x           Positions in the host memory.
       *  \param[in] atomLocality  Locality of the particles to copy.
@@ -203,15 +188,17 @@ public:
       * steps and if update is not offloaded, the coordinates are provided by the H2D copy and the
       * returned synchronizer indicates that the copy is complete.
       *
-     *  \param[in] atomLocality    Locality of the particles to wait for.
-     *  \param[in] simulationWork  The simulation lifetime flags.
-     *  \param[in] stepWork        The step lifetime flags.
+     *  \param[in] atomLocality              Locality of the particles to wait for.
+     *  \param[in] simulationWork            The simulation lifetime flags.
+     *  \param[in] stepWork                  The step lifetime flags.
+     *  \param[in] gpuCoordinateHaloLaunched Event recorded when GPU coordinate halo has been launched.
       *
       *  \returns  The event to synchronize the stream that consumes coordinates on device.
       */
      GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality              atomLocality,
                                                             const SimulationWorkload& simulationWork,
-                                                           const StepWorkload&       stepWork);
+                                                           const StepWorkload&       stepWork,
+                                                           GpuEventSynchronizer* gpuCoordinateHaloLaunched = nullptr);
  
      /*! \brief Blocking wait until coordinates are copied to the device.
       *
@@ -221,18 +208,37 @@ public:
       */
      void waitCoordinatesCopiedToDevice(AtomLocality atomLocality);
  
-    /*! \brief Getter for the event synchronizer for the update is done on th GPU
+    /*! \brief Consume the event for copying coordinates to the device.
+     *
+     * Used for manual event consumption. Does nothing except changing the internal event counter.
+     *
+     *  \param[in] atomLocality  Locality of the particles.
+     */
+    void consumeCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality);
+
+    /*! \brief Reset the event for copying coordinates to the device.
       *
-     *  \returns  The event to synchronize the stream coordinates wre updated on device.
+     * Used for manual event consumption. Does nothing except resetting the event.
+     *
+     *  \param[in] atomLocality  Locality of the particles.
+     */
+    void resetCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality);
+
+    /*! \brief Setter for the event synchronizer for when the update is done on the GPU
+     *
+     *  \param[in] xUpdatedOnDeviceEvent  The event to synchronize the stream coordinates were updated on device.
       */
-    GpuEventSynchronizer* xUpdatedOnDevice();
+    void setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpdatedOnDeviceEvent);
  
-    /*! \brief Copy positions from the GPU memory.
+    /*! \brief Copy positions from the GPU memory, with an optional explicit dependency.
       *
       *  \param[in] h_x           Positions buffer in the host memory.
       *  \param[in] atomLocality  Locality of the particles to copy.
+     *  \param[in] dependency    Dependency event for this operation.
       */
-    void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x, AtomLocality atomLocality);
+    void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x,
+                                AtomLocality             atomLocality,
+                                GpuEventSynchronizer*    dependency = nullptr);
  
      /*! \brief Wait until coordinates are available on the host.
       *
@@ -248,20 +254,14 @@ public:
      DeviceBuffer<RVec> getVelocities();
  
      /*! \brief Copy velocities to the GPU memory.
+     *
+     * Does not mark any event, because we don't use it anywhere at the moment.
       *
       *  \param[in] h_v           Velocities in the host memory.
       *  \param[in] atomLocality  Locality of the particles to copy.
       */
      void copyVelocitiesToGpu(gmx::ArrayRef<const gmx::RVec> h_v, AtomLocality atomLocality);
  
-    /*! \brief Get the event synchronizer for the H2D velocities copy.
-     *
-     *  \param[in] atomLocality  Locality of the particles to wait for.
-     *
-     *  \returns  The event to synchronize the stream that consumes velocities on device.
-     */
-    GpuEventSynchronizer* getVelocitiesReadyOnDeviceEvent(AtomLocality atomLocality);
-
      /*! \brief Copy velocities from the GPU memory.
       *
       *  \param[in] h_v           Velocities buffer in the host memory.
@@ -289,6 +289,13 @@ public:
       */
      void copyForcesToGpu(gmx::ArrayRef<const gmx::RVec> h_f, AtomLocality atomLocality);
  
+    /*! \brief Clear forces in the GPU memory.
+     *
+     *  \param[in] atomLocality  Locality of the particles to clear.
+     *  \param[in] dependency    Dependency event for this operation.
+     */
+    void clearForcesOnGpu(AtomLocality atomLocality, GpuEventSynchronizer* dependency);
+
      /*! \brief Get the event synchronizer for the forces ready on device.
       *
       *  Returns either of the event synchronizers, depending on the offload scenario
@@ -296,20 +303,33 @@ public:
       *  1. The forces are copied to the device (when GPU buffer ops are off)
       *  2. The forces are reduced on the device (GPU buffer ops are on)
       *
-     *  \todo Pass step workload instead of the useGpuFBufferOps boolean.
-     *
-     *  \param[in] atomLocality      Locality of the particles to wait for.
-     *  \param[in] useGpuFBufferOps  If the force buffer ops are offloaded to the GPU.
+     *  \param[in] stepWork        Step workload flags
+     *  \param[in] simulationWork  Simulation workload flags
       *
       *  \returns  The event to synchronize the stream that consumes forces on device.
       */
-    GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality atomLocality, bool useGpuFBufferOps);
+    GpuEventSynchronizer* getLocalForcesReadyOnDeviceEvent(StepWorkload       stepWork,
+                                                           SimulationWorkload simulationWork);
  
      /*! \brief Getter for the event synchronizer for the forces are reduced on the GPU.
       *
-     *  \returns  The event to mark when forces are reduced on the GPU.
+     *  \param[in] atomLocality      Locality of the particles to wait for.
+     *  \returns                     The event to mark when forces are reduced on the GPU.
+     */
+    GpuEventSynchronizer* fReducedOnDevice(AtomLocality atomLocality);
+
+    /*! \brief Consume the event for when the forces are reduced on the GPU.
+     *
+     *  \param[in] atomLocality      Locality of the particles to wait for.
+     */
+    void consumeForcesReducedOnDeviceEvent(AtomLocality atomLocality);
+
+    /*! \brief Getter for the event synchronizer for when the forces are ready on the GPU.
+     *
+     *  \param[in] atomLocality      Locality of the particles to wait for.
+     *  \returns                     The event to mark when forces are ready on the GPU.
       */
-    GpuEventSynchronizer* fReducedOnDevice();
+    GpuEventSynchronizer* fReadyOnDevice(AtomLocality atomLocality);
  
      /*! \brief Copy forces from the GPU memory.
       *
@@ -330,23 +350,23 @@ public:
       *
       *  \returns The device command stream to use in update-constraints.
       */
-    void* getUpdateStream();
+    const DeviceStream* getUpdateStream();
  
      /*! \brief Getter for the number of local atoms.
       *
       *  \returns The number of local atoms.
       */
-    int numAtomsLocal();
+    int numAtomsLocal() const;
  
      /*! \brief Getter for the total number of atoms.
       *
       *  \returns The total number of atoms.
       */
-    int numAtomsAll();
+    int numAtomsAll() const;
  
  private:
      class Impl;
-    gmx::PrivateImplPointer<Impl> impl_;
+    std::unique_ptr<Impl> impl_;
      GMX_DISALLOW_COPY_AND_ASSIGN(StatePropagatorDataGpu);
  };