Add management for velocities and forces copy events to StatePropagatorDataGpu
author Artem Zhmurov <zhmurov@gmail.com>
Fri, 4 Oct 2019 15:48:34 +0000 (17:48 +0200)
committer Artem Zhmurov <zhmurov@gmail.com>
Thu, 10 Oct 2019 10:15:43 +0000 (12:15 +0200)
All H2D and D2H copies of velocities and forces now record an event, and methods
to synchronize on those events are added to the class.

Change-Id: I910c5834d83f317f12c1fe0cd71ced168f412386

src/gromacs/mdtypes/state_propagator_data_gpu.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp

index a061917ca70cf2d3e74322ed1998e951fcf9bc37..5c20b44793ea21e01ed2ce770131a556ba74507e 100644 (file)
@@ -173,7 +173,7 @@ class StatePropagatorDataGpu
         void copyCoordinatesToGpu(gmx::ArrayRef<const gmx::RVec>  h_x,
                                   AtomLocality                    atomLocality);
 
-        /*! \brief Get the event synchronizer on the H2D coordinates copy.
+        /*! \brief Get the event synchronizer for the H2D coordinates copy.
          *
          *  \param[in] atomLocality  Locality of the particles to wait for.
          *
@@ -210,6 +210,14 @@ class StatePropagatorDataGpu
         void copyVelocitiesToGpu(gmx::ArrayRef<const gmx::RVec>  h_v,
                                  AtomLocality                    atomLocality);
 
+        /*! \brief Get the event synchronizer for the H2D velocities copy.
+         *
+         *  \param[in] atomLocality  Locality of the particles to wait for.
+         *
+         *  \returns  The event to synchronize the stream that consumes velocities on device.
+         */
+        GpuEventSynchronizer* getVelocitiesReadyOnDeviceEvent(AtomLocality  atomLocality);
+
         /*! \brief Copy velocities from the GPU memory.
          *
          *  \param[in] h_v           Velocities buffer in the host memory.
@@ -218,6 +226,12 @@ class StatePropagatorDataGpu
         void copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec>  h_v,
                                    AtomLocality              atomLocality);
 
+        /*! \brief Wait until velocities are available on the host.
+         *
+         *  \param[in] atomLocality  Locality of the particles to wait for.
+         */
+        void waitVelocitiesReadyOnHost(AtomLocality  atomLocality);
+
 
         /*! \brief Get the force buffer on the GPU.
          *
@@ -233,6 +247,14 @@ class StatePropagatorDataGpu
         void copyForcesToGpu(gmx::ArrayRef<const gmx::RVec>  h_f,
                              AtomLocality                    atomLocality);
 
+        /*! \brief Get the event synchronizer for the H2D forces copy.
+         *
+         *  \param[in] atomLocality  Locality of the particles to wait for.
+         *
+         *  \returns  The event to synchronize the stream that consumes forces on device.
+         */
+        GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality  atomLocality);
+
         /*! \brief Copy forces from the GPU memory.
          *
          *  \param[in] h_f           Forces buffer in the host memory.
@@ -241,6 +263,12 @@ class StatePropagatorDataGpu
         void copyForcesFromGpu(gmx::ArrayRef<gmx::RVec>  h_f,
                                AtomLocality              atomLocality);
 
+        /*! \brief Wait until forces are available on the host.
+         *
+         *  \param[in] atomLocality  Locality of the particles to wait for.
+         */
+        void waitForcesReadyOnHost(AtomLocality  atomLocality);
+
         /*! \brief Getter for the update stream.
          *
          *  \todo This is temporary here, until the management of this stream is taken over.
index 7e2072f7dab40a32a5c2ff4e4661a08ea88cf376..6160bdf5a0a49f6a8cf26b32343faf2533a658a2 100644 (file)
@@ -124,12 +124,23 @@ void StatePropagatorDataGpu::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::
     GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
 }
 
+GpuEventSynchronizer* StatePropagatorDataGpu::getVelocitiesReadyOnDeviceEvent(AtomLocality  /* atomLocality */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
+    return nullptr;
+}
+
 void StatePropagatorDataGpu::copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec>  /* h_v          */,
                                                    AtomLocality              /* atomLocality */)
 {
     GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
 }
 
+void StatePropagatorDataGpu::waitVelocitiesReadyOnHost(AtomLocality  /* atomLocality */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
+}
+
 
 DeviceBuffer<float> StatePropagatorDataGpu::getForces()
 {
@@ -143,12 +154,24 @@ void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec
     GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
 }
 
+GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality  /* atomLocality */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
+    return nullptr; // CPU stub has no event to return.
+}
+
 void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec>  /* h_f          */,
                                                AtomLocality              /* atomLocality */)
 {
     GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
 }
 
+void StatePropagatorDataGpu::waitForcesReadyOnHost(AtomLocality  /* atomLocality */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
+}
+
+
 void* StatePropagatorDataGpu::getUpdateStream()
 {
     GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
index 037eeadedf7c9c4a549c7afcae41f12323c48667..cb15236738081934eb4571f8ded2e412b1da8e9e 100644 (file)
@@ -197,6 +197,14 @@ class StatePropagatorDataGpu::Impl
         void copyVelocitiesToGpu(gmx::ArrayRef<const gmx::RVec>  h_v,
                                  AtomLocality                    atomLocality);
 
+        /*! \brief Get the event synchronizer for the H2D velocities copy.
+         *
+         *  \param[in] atomLocality  Locality of the particles to wait for.
+         *
+         *  \returns  The event to synchronize the stream that consumes velocities on device.
+         */
+        GpuEventSynchronizer* getVelocitiesReadyOnDeviceEvent(AtomLocality  atomLocality);
+
         /*! \brief Copy velocities from the GPU memory.
          *
          *  \param[in] h_v           Velocities buffer in the host memory.
@@ -205,6 +213,12 @@ class StatePropagatorDataGpu::Impl
         void copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec>  h_v,
                                    AtomLocality              atomLocality);
 
+        /*! \brief Wait until velocities are available on the host.
+         *
+         *  \param[in] atomLocality  Locality of the particles to wait for.
+         */
+        void waitVelocitiesReadyOnHost(AtomLocality  atomLocality);
+
 
         /*! \brief Get the force buffer on the GPU.
          *
@@ -220,6 +234,14 @@ class StatePropagatorDataGpu::Impl
         void copyForcesToGpu(gmx::ArrayRef<const gmx::RVec>  h_f,
                              AtomLocality                    atomLocality);
 
+        /*! \brief Get the event synchronizer for the H2D forces copy.
+         *
+         *  \param[in] atomLocality  Locality of the particles to wait for.
+         *
+         *  \returns  The event to synchronize the stream that consumes forces on device.
+         */
+        GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality  atomLocality);
+
         /*! \brief Copy forces from the GPU memory.
          *
          *  \param[in] h_f           Forces buffer in the host memory.
@@ -228,6 +250,12 @@ class StatePropagatorDataGpu::Impl
         void copyForcesFromGpu(gmx::ArrayRef<gmx::RVec>  h_f,
                                AtomLocality              atomLocality);
 
+        /*! \brief Wait until forces are available on the host.
+         *
+         *  \param[in] atomLocality  Locality of the particles to wait for.
+         */
+        void waitForcesReadyOnHost(AtomLocality  atomLocality);
+
         /*! \brief Getter for the update stream.
          *
          *  \todo This is temporary here, until the management of this stream is taken over.
@@ -259,14 +287,28 @@ class StatePropagatorDataGpu::Impl
         //! GPU Update-constreaints stream.
         CommandStream        updateStream_               = nullptr;
 
-        // Streams to use for coordinates H2S and D2H copies (one event for each atom locality)
+        // Streams to use for coordinates H2D and D2H copies (one stream for each atom locality)
         EnumerationArray<AtomLocality, CommandStream> xCopyStreams_ = {{nullptr}};
+        // Streams to use for velocities H2D and D2H copies (one stream for each atom locality)
+        EnumerationArray<AtomLocality, CommandStream> vCopyStreams_ = {{nullptr}};
+        // Streams to use for forces H2D and D2H copies (one stream for each atom locality)
+        EnumerationArray<AtomLocality, CommandStream> fCopyStreams_ = {{nullptr}};
 
         //! An array of events that indicate H2D copy is complete (one event for each atom locality)
         EnumerationArray<AtomLocality, GpuEventSynchronizer> xReadyOnDevice_;
-        //! An array of events that indicate D2H copy is complete (one event for each atom locality)
+        //! An array of events that indicate D2H copy of coordinates is complete (one event for each atom locality)
         EnumerationArray<AtomLocality, GpuEventSynchronizer> xReadyOnHost_;
 
+        //! An array of events that indicate H2D copy of velocities is complete (one event for each atom locality)
+        EnumerationArray<AtomLocality, GpuEventSynchronizer> vReadyOnDevice_;
+        //! An array of events that indicate D2H copy of velocities is complete (one event for each atom locality)
+        EnumerationArray<AtomLocality, GpuEventSynchronizer> vReadyOnHost_;
+
+        //! An array of events that indicate H2D copy of forces is complete (one event for each atom locality)
+        EnumerationArray<AtomLocality, GpuEventSynchronizer> fReadyOnDevice_;
+        //! An array of events that indicate D2H copy of forces is complete (one event for each atom locality)
+        EnumerationArray<AtomLocality, GpuEventSynchronizer> fReadyOnHost_;
+
         /*! \brief GPU context (for OpenCL builds)
          * \todo Make a Context class usable in CPU code
          */
index ba6850db45ed7f2e187b626794fd8f8f7cf4deff..7070891d6c706b42352ae9ba0d2d8e8447b908a5 100644 (file)
@@ -112,11 +112,20 @@ StatePropagatorDataGpu::Impl::Impl(const void            *pmeStream,
         GMX_UNUSED_VALUE(deviceContext);
     }
 
-    // Map the atom locality to the stream that will be used for coordinates transfer.
-    // Same streams are used for H2D and D2H copies
+    // Map the atom locality to the stream that will be used for coordinates,
+    // velocities and forces transfers. Same streams are used for H2D and D2H copies.
+    // Note, that nullptr stream is used here to indicate that the copy is not supported.
     xCopyStreams_[AtomLocality::Local]    = updateStream_;
     xCopyStreams_[AtomLocality::NonLocal] = nonLocalStream_;
     xCopyStreams_[AtomLocality::All]      = updateStream_;
+
+    vCopyStreams_[AtomLocality::Local]    = updateStream_;
+    vCopyStreams_[AtomLocality::NonLocal] = nullptr;
+    vCopyStreams_[AtomLocality::All]      = updateStream_;
+
+    fCopyStreams_[AtomLocality::Local]    = localStream_;
+    fCopyStreams_[AtomLocality::NonLocal] = nonLocalStream_;
+    fCopyStreams_[AtomLocality::All]      = nullptr;
 }
 
 StatePropagatorDataGpu::Impl::~Impl()
@@ -293,15 +302,38 @@ DeviceBuffer<float> StatePropagatorDataGpu::Impl::getVelocities()
 void StatePropagatorDataGpu::Impl::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec>  h_v,
                                                        AtomLocality                          atomLocality)
 {
-    // TODO: Use the correct stream
-    copyToDevice(d_v_, h_v, d_vSize_, atomLocality, nullptr);
+    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+    CommandStream commandStream = vCopyStreams_[atomLocality];
+    GMX_ASSERT(commandStream != nullptr, "No stream is valid for copying velocities with given atom locality.");
+
+    copyToDevice(d_v_, h_v, d_vSize_, atomLocality, commandStream);
+    // TODO: Remove the gpuStreamSynchronize() call below when event-based synchronization is introduced
+    gpuStreamSynchronize(commandStream);
+    vReadyOnDevice_[atomLocality].markEvent(commandStream);
+}
+
+GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getVelocitiesReadyOnDeviceEvent(AtomLocality  atomLocality)
+{
+    return &vReadyOnDevice_[atomLocality];
 }
 
+
 void StatePropagatorDataGpu::Impl::copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec>  h_v,
                                                          AtomLocality              atomLocality)
 {
-    // TODO: Use the correct stream
-    copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, nullptr);
+    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+    CommandStream commandStream = vCopyStreams_[atomLocality];
+    GMX_ASSERT(commandStream != nullptr, "No stream is valid for copying velocities with given atom locality.");
+
+    copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, commandStream);
+    // TODO: Remove the gpuStreamSynchronize() call below when event-based synchronization is introduced
+    gpuStreamSynchronize(commandStream);
+    vReadyOnHost_[atomLocality].markEvent(commandStream);
+}
+
+void StatePropagatorDataGpu::Impl::waitVelocitiesReadyOnHost(AtomLocality  atomLocality)
+{
+    vReadyOnHost_[atomLocality].waitForEvent();
 }
 
 
@@ -313,15 +345,38 @@ DeviceBuffer<float> StatePropagatorDataGpu::Impl::getForces()
 void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec>  h_f,
                                                    AtomLocality                          atomLocality)
 {
-    // TODO: Use the correct stream
-    copyToDevice(d_f_, h_f, d_fSize_, atomLocality, nullptr);
+    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+    CommandStream commandStream = fCopyStreams_[atomLocality];
+    GMX_ASSERT(commandStream != nullptr, "No stream is valid for copying forces with given atom locality.");
+
+    copyToDevice(d_f_, h_f, d_fSize_, atomLocality, commandStream);
+    // TODO: Remove the gpuStreamSynchronize() call below when event-based synchronization is introduced
+    gpuStreamSynchronize(commandStream);
+    fReadyOnDevice_[atomLocality].markEvent(commandStream);
 }
 
+GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getForcesReadyOnDeviceEvent(AtomLocality  atomLocality)
+{
+    return &fReadyOnDevice_[atomLocality];
+}
+
+
 void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec>  h_f,
                                                      AtomLocality              atomLocality)
 {
-    // TODO: Use the correct stream
-    copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, nullptr);
+    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+    CommandStream commandStream = fCopyStreams_[atomLocality];
+    GMX_ASSERT(commandStream != nullptr, "No stream is valid for copying forces with given atom locality.");
+
+    copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, commandStream);
+    // TODO: Remove the gpuStreamSynchronize() call below when event-based synchronization is introduced
+    gpuStreamSynchronize(commandStream);
+    fReadyOnHost_[atomLocality].markEvent(commandStream);
+}
+
+void StatePropagatorDataGpu::Impl::waitForcesReadyOnHost(AtomLocality  atomLocality)
+{
+    fReadyOnHost_[atomLocality].waitForEvent();
 }
 
 void* StatePropagatorDataGpu::Impl::getUpdateStream()
@@ -413,12 +468,22 @@ void StatePropagatorDataGpu::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::
     return impl_->copyVelocitiesToGpu(h_v, atomLocality);
 }
 
+GpuEventSynchronizer* StatePropagatorDataGpu::getVelocitiesReadyOnDeviceEvent(AtomLocality  atomLocality)
+{
+    return impl_->getVelocitiesReadyOnDeviceEvent(atomLocality);
+}
+
 void StatePropagatorDataGpu::copyVelocitiesFromGpu(gmx::ArrayRef<RVec>  h_v,
                                                    AtomLocality         atomLocality)
 {
     return impl_->copyVelocitiesFromGpu(h_v, atomLocality);
 }
 
+void StatePropagatorDataGpu::waitVelocitiesReadyOnHost(AtomLocality  atomLocality)
+{
+    return impl_->waitVelocitiesReadyOnHost(atomLocality);
+}
+
 
 DeviceBuffer<float> StatePropagatorDataGpu::getForces()
 {
@@ -431,12 +496,23 @@ void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec
     return impl_->copyForcesToGpu(h_f, atomLocality);
 }
 
+GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality  atomLocality)
+{
+    return impl_->getForcesReadyOnDeviceEvent(atomLocality);
+}
+
 void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef<RVec>  h_f,
                                                AtomLocality         atomLocality)
 {
     return impl_->copyForcesFromGpu(h_f, atomLocality);
 }
 
+void StatePropagatorDataGpu::waitForcesReadyOnHost(AtomLocality  atomLocality)
+{
+    return impl_->waitForcesReadyOnHost(atomLocality);
+}
+
+
 void* StatePropagatorDataGpu::getUpdateStream()
 {
     return impl_->getUpdateStream();