Add management for coordinates copy events into StatePropagatorDataGpu

author Szilárd Páll <pall.szilard@gmail.com>

Mon, 30 Sep 2019 13:04:10 +0000 (15:04 +0200)

committer Artem Zhmurov <zhmurov@gmail.com>

Thu, 10 Oct 2019 08:39:49 +0000 (10:39 +0200)
author Szilárd Páll <pall.szilard@gmail.com>
Mon, 30 Sep 2019 13:04:10 +0000 (15:04 +0200)
committer Artem Zhmurov <zhmurov@gmail.com>
Thu, 10 Oct 2019 08:39:49 +0000 (10:39 +0200)
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h

index ff43c4a807c6f03e920c6d1105b4d32f02682c1b..a061917ca70cf2d3e74322ed1998e951fcf9bc37 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h
@@ -56,6 +56,8 @@
  #include "gromacs/utility/arrayref.h"
  #include "gromacs/utility/classhelpers.h"
  
+class GpuEventSynchronizer;
+
  namespace gmx
  {
  
@@ -87,14 +89,29 @@ class StatePropagatorDataGpu
           * PME work on the GPU, and if that rank also does PP work that is the only
           * rank. So all coordinates are always transferred.
           *
-         * \note \p commandStream and \p deviceContext are allowed to be nullptr if
-         *       StatePropagatorDataGpu is not used in the OpenCL run (e.g. if PME
-         *       does not run on the GPU).
+         * In OpenCL, only pmeStream is used since it is the only stream created in
+         * PME context. The local and non-local streams are only needed when buffer
+         * ops are offloaded. This feature is currently not available in OpenCL and
+         * hence these streams are not set in these builds.
+         *
+         * \note In CUDA, the update stream is created in the constructor as a temporary
+         *       solution, in place until the stream manager is introduced.
+         *       Note that this makes it impossible to construct this object in CUDA
+         *       builds executing on a host without any CUDA-capable device available.
+         *
+         * \note In CUDA, \p deviceContext is unused, hence always nullptr;
+         *       all stream arguments can also be nullptr in runs where the
+         *       respective streams are not required.
+         *       In OpenCL, \p deviceContext needs to be a valid device context.
+         *       In OpenCL runs StatePropagatorDataGpu is currently only used
+         *       with PME offload, and only on ranks with PME duty. Hence, the
+         *       \p pmeStream argument needs to be a valid OpenCL queue object
+         *       which must have been created in \p deviceContext.
           *
-         * \todo A CommandStream is now visible in the CPU parts of the code so we
-         *       can stop passing a void*.
-         * \todo A DeviceContext object is visible in CPU parts of the code so we
-         *       can stop passing a void*.
+         * \todo Make a \p CommandStream visible in the CPU parts of the code so we
+         *       will not have to pass a void*.
+         * \todo Make a \p DeviceContext object visible in CPU parts of the code so we
+         *       will not have to pass a void*.
           *
           *  \param[in] pmeStream       Device PME stream, nullptr allowed.
           *  \param[in] localStream     Device NBNXM local stream, nullptr allowed.
@@ -118,8 +135,11 @@ class StatePropagatorDataGpu
  
          /*! \brief Set the ranges for local and non-local atoms and reallocates buffers.
           *
-         * The coordinates buffer is reallocated with the padding added at the end. The
-         * size of padding is set by the constructor.
+         * \note
+         * The coordinates buffer is (re)allocated, when required by PME, with a padding,
+         * the size of which is set by the constructor. The padding region clearing kernel
+         * is scheduled in the \p pmeStream_ (unlike the coordinates H2D) as only the PME
+         * task uses this padding area.
           *
           *  \param[in] numAtomsLocal  Number of atoms in local domain.
           *  \param[in] numAtomsAll    Total number of atoms to handle.
@@ -153,6 +173,14 @@ class StatePropagatorDataGpu
          void copyCoordinatesToGpu(gmx::ArrayRef<const gmx::RVec>  h_x,
                                    AtomLocality                    atomLocality);
  
+        /*! \brief Get the event synchronizer on the H2D coordinates copy.
+         *
+         *  \param[in] atomLocality  Locality of the particles to wait for.
+         *
+         *  \returns  The event to synchronize the stream that consumes coordinates on device.
+         */
+        GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality  atomLocality);
+
          /*! \brief Copy positions from the GPU memory.
           *
           *  \param[in] h_x           Positions buffer in the host memory.
@@ -161,6 +189,12 @@ class StatePropagatorDataGpu
          void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec>  h_x,
                                      AtomLocality              atomLocality);
  
+        /*! \brief Wait until coordinates are available on the host.
+         *
+         *  \param[in] atomLocality  Locality of the particles to wait for.
+         */
+        void waitCoordinatesReadyOnHost(AtomLocality  atomLocality);
+
  
          /*! \brief Get the velocities buffer on the GPU.
           *
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp

index 6a7dd647307fd47f0302fa9fb6da58f502364ba7..7e2072f7dab40a32a5c2ff4e4661a08ea88cf376 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
@@ -73,86 +73,97 @@ StatePropagatorDataGpu::~StatePropagatorDataGpu() = default;
  void StatePropagatorDataGpu::reinit(int  /* numAtomsLocal */,
                                      int  /* numAtomsAll   */)
  {
-    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
  }
  
  std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality  /* atomLocality */)
  {
-    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
      return std::make_tuple(0, 0);
  }
  
  DeviceBuffer<float> StatePropagatorDataGpu::getCoordinates()
  {
-    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
      return DeviceBuffer<float> {};
  }
  
+GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality  /* atomLocality */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
+    return nullptr;
+}
+
  void StatePropagatorDataGpu::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec>  /* h_x          */,
                                                    AtomLocality                          /* atomLocality */)
  {
-    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
+}
+
+void StatePropagatorDataGpu::waitCoordinatesReadyOnHost(AtomLocality   /* atomLocality */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
  }
  
  void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec>  /* h_x          */,
                                                      AtomLocality              /* atomLocality */)
  {
-    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
  }
  
  
  DeviceBuffer<float> StatePropagatorDataGpu::getVelocities()
  {
-    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
      return DeviceBuffer<float> {};
  }
  
  void StatePropagatorDataGpu::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec>  /* h_v          */,
                                                   AtomLocality                          /* atomLocality */)
  {
-    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
  }
  
  void StatePropagatorDataGpu::copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec>  /* h_v          */,
                                                     AtomLocality              /* atomLocality */)
  {
-    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
  }
  
  
  DeviceBuffer<float> StatePropagatorDataGpu::getForces()
  {
-    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
      return DeviceBuffer<float> {};
  }
  
  void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec>  /* h_f          */,
                                               AtomLocality                          /* atomLocality */)
  {
-    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
  }
  
  void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec>  /* h_f          */,
                                                 AtomLocality              /* atomLocality */)
  {
-    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
  }
  
  void* StatePropagatorDataGpu::getUpdateStream()
  {
-    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
      return nullptr;
  }
  
  int StatePropagatorDataGpu::numAtomsLocal()
  {
-    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
      return 0;
  }
  
  int StatePropagatorDataGpu::numAtomsAll()
  {
-    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
      return 0;
  }
  
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h

index 0b73395af8939a4e8d50bbe409be8aab77cb9d2d..037eeadedf7c9c4a549c7afcae41f12323c48667 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
@@ -45,10 +45,18 @@
  
  #include "gmxpre.h"
  
+#include "config.h"
+
  #include "gromacs/gpu_utils/devicebuffer.h"
+#if GMX_GPU == GMX_GPU_CUDA
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#elif GMX_GPU == GMX_GPU_OPENCL
+#include "gromacs/gpu_utils/gpueventsynchronizer_ocl.h"
+#endif
  #include "gromacs/math/vectypes.h"
  #include "gromacs/mdtypes/state_propagator_data_gpu.h"
  #include "gromacs/utility/classhelpers.h"
+#include "gromacs/utility/enumerationhelpers.h"
  
  namespace gmx
  {
@@ -71,14 +79,29 @@ class StatePropagatorDataGpu::Impl
           * PME work on the GPU, and if that rank also does PP work that is the only
           * rank. So all coordinates are always transferred.
           *
-         * \note \p commandStream and \p deviceContext are allowed to be nullptr if
-         *       StatePropagatorDataGpu is not used in the OpenCL run (e.g. if PME
-         *       does not run on the GPU).
+         * In OpenCL, only pmeStream is used since it is the only stream created in
+         * PME context. The local and non-local streams are only needed when buffer
+         * ops are offloaded. This feature is currently not available in OpenCL and
+         * hence these streams are not set in these builds.
+         *
+         * \note In CUDA, the update stream is created in the constructor as a temporary
+         *       solution, in place until the stream manager is introduced.
+         *       Note that this makes it impossible to construct this object in CUDA
+         *       builds executing on a host without any CUDA-capable device available.
           *
-         * \todo A CommandStream is now visible in the CPU parts of the code so we
-         *       can stop passing a void*.
-         * \todo A DeviceContext object is visible in CPU parts of the code so we
-         *       can stop passing a void*.
+         * \note In CUDA, \p deviceContext is unused, hence always nullptr;
+         *       all stream arguments can also be nullptr in runs where the
+         *       respective streams are not required.
+         *       In OpenCL, \p deviceContext needs to be a valid device context.
+         *       In OpenCL runs StatePropagatorDataGpu is currently only used
+         *       with PME offload, and only on ranks with PME duty. Hence, the
+         *       \p pmeStream argument needs to be a valid OpenCL queue object
+         *       which must have been created in \p deviceContext.
+         *
+         * \todo Make a \p CommandStream visible in the CPU parts of the code so we
+         *       will not have to pass a void*.
+         * \todo Make a \p DeviceContext object visible in CPU parts of the code so we
+         *       will not have to pass a void*.
           *
           *  \param[in] pmeStream       Device PME stream, nullptr allowed.
           *  \param[in] localStream     Device NBNXM local stream, nullptr allowed.
@@ -99,11 +122,11 @@ class StatePropagatorDataGpu::Impl
  
          /*! \brief Set the ranges for local and non-local atoms and reallocates buffers.
           *
-         * The coordinates buffer is reallocated with the padding added at the end. The
-         * size of padding is set by the constructor.
-         *
-         * \note The PME requires clearing of the padding, which is done in the pmeStream_.
-         *       Hence the pmeStream_ should be created in the gpuContext_.
+         * \note
+         * The coordinates buffer is (re)allocated, when required by PME, with a padding,
+         * the size of which is set by the constructor. The padding region clearing kernel
+         * is scheduled in the \p pmeStream_ (unlike the coordinates H2D) as only the PME
+         * task uses this padding area.
           *
           *  \param[in] numAtomsLocal  Number of atoms in local domain.
           *  \param[in] numAtomsAll    Total number of atoms to handle.
@@ -137,6 +160,14 @@ class StatePropagatorDataGpu::Impl
          void copyCoordinatesToGpu(gmx::ArrayRef<const gmx::RVec>  h_x,
                                    AtomLocality                    atomLocality);
  
+        /*! \brief Get the event synchronizer on the H2D coordinates copy.
+         *
+         *  \param[in] atomLocality  Locality of the particles to wait for.
+         *
+         *  \returns  The event to synchronize the stream that consumes coordinates on device.
+         */
+        GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality  atomLocality);
+
          /*! \brief Copy positions from the GPU memory.
           *
           *  \param[in] h_x           Positions buffer in the host memory.
@@ -145,6 +176,12 @@ class StatePropagatorDataGpu::Impl
          void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec>  h_x,
                                      AtomLocality              atomLocality);
  
+        /*! \brief Wait until coordinates are available on the host.
+         *
+         *  \param[in] atomLocality  Locality of the particles to wait for.
+         */
+        void waitCoordinatesReadyOnHost(AtomLocality  atomLocality);
+
  
          /*! \brief Get the velocities buffer on the GPU.
           *
@@ -221,6 +258,15 @@ class StatePropagatorDataGpu::Impl
          CommandStream        nonLocalStream_             = nullptr;
          //! GPU Update-constreaints stream.
          CommandStream        updateStream_               = nullptr;
+
+        // Streams to use for coordinates H2S and D2H copies (one event for each atom locality)
+        EnumerationArray<AtomLocality, CommandStream> xCopyStreams_ = {{nullptr}};
+
+        //! An array of events that indicate H2D copy is complete (one event for each atom locality)
+        EnumerationArray<AtomLocality, GpuEventSynchronizer> xReadyOnDevice_;
+        //! An array of events that indicate D2H copy is complete (one event for each atom locality)
+        EnumerationArray<AtomLocality, GpuEventSynchronizer> xReadyOnHost_;
+
          /*! \brief GPU context (for OpenCL builds)
           * \todo Make a Context class usable in CPU code
           */
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp

index 3cc0bb7653bc4286265bb178ec67785732f5f135..ba6850db45ed7f2e187b626794fd8f8f7cf4deff 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
@@ -71,33 +71,52 @@ StatePropagatorDataGpu::Impl::Impl(const void            *pmeStream,
      transferKind_(transferKind),
      paddingSize_(paddingSize)
  {
-
+    static_assert(GMX_GPU != GMX_GPU_NONE, "This object should only be constructed on the GPU code-paths.");
      GMX_RELEASE_ASSERT(getenv("GMX_USE_GPU_BUFFER_OPS") == nullptr, "GPU buffer ops are not supported in this build.");
  
-    if (pmeStream != nullptr)
-    {
-        pmeStream_ = *static_cast<const CommandStream*>(pmeStream);
-    }
-    if (localStream != nullptr)
+    // TODO: Refactor when the StreamManager is introduced.
+    if (GMX_GPU == GMX_GPU_OPENCL)
      {
-        localStream_ = *static_cast<const CommandStream*>(localStream);
+        GMX_ASSERT(deviceContext != nullptr, "GPU context should be set in OpenCL builds.");
+        GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set in OpenCL builds.");
+
+        // The update stream is set to the PME stream in OpenCL, since PME stream is the only stream created in the PME context.
+        pmeStream_      = *static_cast<const CommandStream*>(pmeStream);
+        updateStream_   = *static_cast<const CommandStream*>(pmeStream);
+        deviceContext_  = *static_cast<const DeviceContext*>(deviceContext);
+        GMX_UNUSED_VALUE(localStream);
+        GMX_UNUSED_VALUE(nonLocalStream);
      }
-    if (nonLocalStream != nullptr)
+
+    if (GMX_GPU == GMX_GPU_CUDA)
      {
-        nonLocalStream_ = *static_cast<const CommandStream*>(nonLocalStream);
-    }
-// The OpenCL build will never use the updateStream
-// TODO: The update stream should be created only when it is needed.
-#if GMX_GPU == GMX_GPU_CUDA
-    cudaError_t stat;
-    stat = cudaStreamCreate(&updateStream_);
-    CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu");
+        if (pmeStream != nullptr)
+        {
+            pmeStream_ = *static_cast<const CommandStream*>(pmeStream);
+        }
+        if (localStream != nullptr)
+        {
+            localStream_ = *static_cast<const CommandStream*>(localStream);
+        }
+        if (nonLocalStream != nullptr)
+        {
+            nonLocalStream_ = *static_cast<const CommandStream*>(nonLocalStream);
+        }
+
+        // TODO: The update stream should be created only when it is needed.
+#if (GMX_GPU == GMX_GPU_CUDA)
+        cudaError_t stat;
+        stat = cudaStreamCreate(&updateStream_);
+        CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu");
  #endif
-    if (deviceContext != nullptr)
-    {
-        deviceContext_ = *static_cast<const DeviceContext*>(deviceContext);
+        GMX_UNUSED_VALUE(deviceContext);
      }
  
+    // Map the atom locality to the stream that will be used for coordinates transfer.
+    // Same streams are used for H2D and D2H copies
+    xCopyStreams_[AtomLocality::Local]    = updateStream_;
+    xCopyStreams_[AtomLocality::NonLocal] = nonLocalStream_;
+    xCopyStreams_[AtomLocality::All]      = updateStream_;
  }
  
  StatePropagatorDataGpu::Impl::~Impl()
@@ -106,9 +125,6 @@ StatePropagatorDataGpu::Impl::~Impl()
  
  void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
  {
-#if GMX_GPU == GMX_GPU_OPENCL
-    GMX_ASSERT(deviceContext_ != nullptr, "GPU context should be set in OpenCL builds.");
-#endif
      numAtomsLocal_ = numAtomsLocal;
      numAtomsAll_   = numAtomsAll;
  
@@ -127,9 +143,7 @@ void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
      const size_t paddingAllocationSize = numAtomsPadded - numAtomsAll_;
      if (paddingAllocationSize > 0)
      {
-        // The PME stream is used here because:
-        // 1. The padding clearing is only needed by PME.
-        // 2. It is the stream that is created in the PME OpenCL context.
+        // The PME stream is used here because the padding region of d_x_ is only in the PME task.
          clearDeviceBufferAsync(&d_x_, DIM*numAtomsAll_, DIM*paddingAllocationSize, pmeStream_);
      }
  
@@ -171,14 +185,6 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<float>
                                                  CommandStream                         commandStream)
  {
  
-#if GMX_GPU == GMX_GPU_OPENCL
-    GMX_ASSERT(deviceContext_ != nullptr, "GPU context should be set in OpenCL builds.");
-    // The PME stream is used for OpenCL builds, because it is the context that it associated with the
-    // PME task which requires the coordinates managed here in OpenCL.
-    // TODO: This will have to be changed when the OpenCL implementation will be extended.
-    commandStream = pmeStream_;
-#endif
-
      GMX_UNUSED_VALUE(dataSize);
  
      GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated.");
@@ -207,14 +213,6 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec>  h_da
                                                    CommandStream             commandStream)
  {
  
-#if GMX_GPU == GMX_GPU_OPENCL
-    GMX_ASSERT(deviceContext_ != nullptr, "GPU context should be set in OpenCL builds.");
-    // The PME stream is used for OpenCL builds, because it is the context that it associated with the
-    // PME task which requires the coordinates managed here in OpenCL.
-    // TODO: This will have to be changed when the OpenCL implementation will be extended.
-    commandStream = pmeStream_;
-#endif
-
      GMX_UNUSED_VALUE(dataSize);
  
      GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated.");
@@ -244,15 +242,46 @@ DeviceBuffer<float> StatePropagatorDataGpu::Impl::getCoordinates()
  void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec>  h_x,
                                                          AtomLocality                          atomLocality)
  {
-    // TODO: Use the correct stream
-    copyToDevice(d_x_, h_x, d_xSize_, atomLocality, nullptr);
+    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+    CommandStream commandStream = xCopyStreams_[atomLocality];
+    GMX_ASSERT(commandStream != nullptr, "No stream is valid for copying positions with given atom locality.");
+
+    copyToDevice(d_x_, h_x, d_xSize_, atomLocality, commandStream);
+
+    // markEvent is skipped in OpenCL as:
+    //   - it's not needed, copy is done in the same stream as the only consumer task (PME)
+    //   - we don't consume the events in OpenCL which is not allowed by GpuEventSynchronizer (would leak memory).
+    // TODO: remove this by adding an event-mark free flavor of this function
+    if (GMX_GPU == GMX_GPU_CUDA)
+    {
+        xReadyOnDevice_[atomLocality].markEvent(commandStream);
+        // TODO: Remove When event-based synchronization is introduced
+        gpuStreamSynchronize(commandStream);
+    }
+}
+
+GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceEvent(AtomLocality  atomLocality)
+{
+    return &xReadyOnDevice_[atomLocality];
  }
  
  void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec>  h_x,
                                                            AtomLocality              atomLocality)
  {
-    // TODO: Use the correct stream
-    copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, nullptr);
+    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+    CommandStream commandStream = xCopyStreams_[atomLocality];
+    GMX_ASSERT(commandStream != nullptr, "No stream is valid for copying positions with given atom locality.");
+
+    copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, commandStream);
+    // TODO: Remove When event-based synchronization is introduced
+    gpuStreamSynchronize(commandStream);
+    // Note: unlike copyCoordinatesToGpu this is not used in OpenCL, and the conditional is not needed.
+    xReadyOnHost_[atomLocality].markEvent(commandStream);
+}
+
+void StatePropagatorDataGpu::Impl::waitCoordinatesReadyOnHost(AtomLocality  atomLocality)
+{
+    xReadyOnHost_[atomLocality].waitForEvent();
  }
  
  
@@ -356,12 +385,22 @@ void StatePropagatorDataGpu::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx:
      return impl_->copyCoordinatesToGpu(h_x, atomLocality);
  }
  
+GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality  atomLocality)
+{
+    return impl_->getCoordinatesReadyOnDeviceEvent(atomLocality);
+}
+
  void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<RVec>  h_x,
                                                      AtomLocality         atomLocality)
  {
      return impl_->copyCoordinatesFromGpu(h_x, atomLocality);
  }
  
+void StatePropagatorDataGpu::waitCoordinatesReadyOnHost(AtomLocality  atomLocality)
+{
+    return impl_->waitCoordinatesReadyOnHost(atomLocality);
+}
+
  
  DeviceBuffer<float> StatePropagatorDataGpu::getVelocities()
  {
author	Szilárd Páll <pall.szilard@gmail.com>
	Mon, 30 Sep 2019 13:04:10 +0000 (15:04 +0200)
committer	Artem Zhmurov <zhmurov@gmail.com>
	Thu, 10 Oct 2019 08:39:49 +0000 (10:39 +0200)
src/gromacs/mdtypes/state_propagator_data_gpu.h		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu_impl.h		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp		patch \| blob \| history