Centralize management of coordinates ready on GPU event
authorArtem Zhmurov <zhmurov@gmail.com>
Wed, 9 Oct 2019 14:02:36 +0000 (16:02 +0200)
committerMark Abraham <mark.j.abraham@gmail.com>
Sat, 12 Oct 2019 11:18:07 +0000 (13:18 +0200)
This change shifts the ownership of the EventSynchronizer
indicating that coordinates are available on the device from
UpdateConstraints to StatePropagatorDataGpu.
The latter now provides a getter that, based on workload flags, decides
which synchronizer to return. If update is offloaded the returned event
records the completion of the update-constraints on the GPU.
When update is not offloaded and on search steps the returned event
records the completion of the host to device coordinate transfer.

Change-Id: Idec9f46e78a2708fa31e895bf7590cdad1872987

src/gromacs/mdlib/update_constrain_cuda.h
src/gromacs/mdlib/update_constrain_cuda_impl.cpp
src/gromacs/mdlib/update_constrain_cuda_impl.cu
src/gromacs/mdlib/update_constrain_cuda_impl.h
src/gromacs/mdrun/md.cpp
src/gromacs/mdtypes/state_propagator_data_gpu.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp

index 9ab79720f6b7e65ccf32d0879a2815e664bb0831..3c4fc103f00746f5afa12a5b8bf1b8b1eade7572 100644 (file)
@@ -51,6 +51,8 @@
 #include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/classhelpers.h"
 
+class GpuEventSynchronizer;
+
 struct gmx_mtop_t;
 struct t_idef;
 struct t_inputrec;
@@ -66,15 +68,23 @@ class UpdateConstrainCuda
     public:
         /*! \brief Create Update-Constrain object.
          *
-         * \param[in] ir             Input record data: LINCS takes number of iterations and order of
-         *                           projection from it.
-         * \param[in] mtop           Topology of the system: SETTLE gets the masses for O and H atoms
-         *                           and target O-H and H-H distances from this object.
-         * \param[in] commandStream  GPU stream to use. Can be nullptr.
+         * The constructor is given a non-nullptr \p commandStream, in which all the update and constrain
+         * routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that modify
+         * coordinates. The event is maintained outside this class and also passed to all (if any) consumers
+         * of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr because the
+         * markEvent(...) method is called unconditionally.
+         *
+         * \param[in] ir                Input record data: LINCS takes number of iterations and order of
+         *                              projection from it.
+         * \param[in] mtop              Topology of the system: SETTLE gets the masses for O and H atoms
+         *                              and target O-H and H-H distances from this object.
+         * \param[in] commandStream     GPU stream to use. Can be nullptr.
+         * \param[in] xUpdatedOnDevice  The event synchronizer to use to mark that update is done on the GPU.
          */
-        UpdateConstrainCuda(const t_inputrec  &ir,
-                            const gmx_mtop_t  &mtop,
-                            const void        *commandStream);
+        UpdateConstrainCuda(const t_inputrec     &ir,
+                            const gmx_mtop_t     &mtop,
+                            const void           *commandStream,
+                            GpuEventSynchronizer *xUpdatedOnDevice);
 
         ~UpdateConstrainCuda();
 
@@ -137,7 +147,7 @@ class UpdateConstrainCuda
 
         /*! \brief Return the synchronizer associated with the event indicated that the coordinates are ready on the device.
          */
-        void *getCoordinatesReadySync();
+        GpuEventSynchronizer* getCoordinatesReadySync();
 
     private:
         class Impl;
index 9e9f56e80fcaaea2ad8102fa4756285d8a29e854..647493ec163348ad6041cc4ef1deba8612dae769 100644 (file)
@@ -57,7 +57,8 @@ class UpdateConstrainCuda::Impl
 
 UpdateConstrainCuda::UpdateConstrainCuda(gmx_unused const t_inputrec     &ir,
                                          gmx_unused const gmx_mtop_t     &mtop,
-                                         gmx_unused const void           *commandStream)
+                                         gmx_unused const void           *commandStream,
+                                         gmx_unused GpuEventSynchronizer *xUpdatedOnDevice)
     : impl_(nullptr)
 {
     GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
@@ -98,7 +99,7 @@ void UpdateConstrainCuda::waitCoordinatesReadyOnDevice()
     GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
 }
 
-void* UpdateConstrainCuda::getCoordinatesReadySync()
+GpuEventSynchronizer* UpdateConstrainCuda::getCoordinatesReadySync()
 {
     GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
     return nullptr;
index a58657b0e85dad99346126db743e9dff226e86d5..b146c83ca8b4b850d24b389edce6f7c5b535ead8 100644 (file)
@@ -107,17 +107,21 @@ void UpdateConstrainCuda::Impl::integrate(const real                        dt,
         }
     }
 
-    coordinatesReady_.markEvent(commandStream_);
+    coordinatesReady_->markEvent(commandStream_);
 
     return;
 }
 
-UpdateConstrainCuda::Impl::Impl(const t_inputrec  &ir,
-                                const gmx_mtop_t  &mtop,
-                                const void        *commandStream)
+UpdateConstrainCuda::Impl::Impl(const t_inputrec     &ir,
+                                const gmx_mtop_t     &mtop,
+                                const void           *commandStream,
+                                GpuEventSynchronizer *xUpdatedOnDevice) :
+    coordinatesReady_(xUpdatedOnDevice)
 {
+    GMX_ASSERT(xUpdatedOnDevice != nullptr, "The event synchronizer can not be nullptr.");
     commandStream != nullptr ? commandStream_ = *static_cast<const CommandStream*>(commandStream) : commandStream_ = nullptr;
 
+
     integrator_ = std::make_unique<LeapFrogCuda>(commandStream_);
     lincsCuda_  = std::make_unique<LincsCuda>(ir.nLincsIter, ir.nProjOrder, commandStream_);
     settleCuda_ = std::make_unique<SettleCuda>(mtop, commandStream_);
@@ -166,18 +170,19 @@ void UpdateConstrainCuda::Impl::setPbc(const t_pbc *pbc)
 
 void UpdateConstrainCuda::Impl::waitCoordinatesReadyOnDevice()
 {
-    coordinatesReady_.waitForEvent();
+    coordinatesReady_->waitForEvent();
 }
 
-void *UpdateConstrainCuda::Impl::getCoordinatesReadySync()
+GpuEventSynchronizer* UpdateConstrainCuda::Impl::getCoordinatesReadySync()
 {
-    return static_cast<void*> (&coordinatesReady_);
+    return coordinatesReady_;
 }
 
-UpdateConstrainCuda::UpdateConstrainCuda(const t_inputrec  &ir,
-                                         const gmx_mtop_t  &mtop,
-                                         const void        *commandStream)
-    : impl_(new Impl(ir, mtop, commandStream))
+UpdateConstrainCuda::UpdateConstrainCuda(const t_inputrec     &ir,
+                                         const gmx_mtop_t     &mtop,
+                                         const void           *commandStream,
+                                         GpuEventSynchronizer *xUpdatedOnDevice)
+    : impl_(new Impl(ir, mtop, commandStream, xUpdatedOnDevice))
 {
 }
 
@@ -218,7 +223,7 @@ void UpdateConstrainCuda::waitCoordinatesReadyOnDevice()
     impl_->waitCoordinatesReadyOnDevice();
 }
 
-void* UpdateConstrainCuda::getCoordinatesReadySync()
+GpuEventSynchronizer* UpdateConstrainCuda::getCoordinatesReadySync()
 {
     return impl_->getCoordinatesReadySync();
 }
index 31a05961e8af17fdac0d9dc4887ad31a4a7eaedc..326578d12fc7519b8942f34313b6e081f9c64a32 100644 (file)
@@ -63,17 +63,25 @@ class UpdateConstrainCuda::Impl
 {
 
     public:
-        /*! \brief Create Update-Constrain object
+        /*! \brief Create Update-Constrain object.
          *
-         * \param[in] ir             Input record data: LINCS takes number of iterations and order of
-         *                           projection from it.
-         * \param[in] mtop           Topology of the system: SETTLE gets the masses for O and H atoms
-         *                           and target O-H and H-H distances from this object.
-         * \param[in] commandStream  GPU stream to use. Can be nullptr.
+         * The constructor is given a non-nullptr \p commandStream, in which all the update and constrain
+         * routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that modify
+         * coordinates. The event is maintained outside this class and also passed to all (if any) consumers
+         * of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr because the
+         * markEvent(...) method is called unconditionally.
+         *
+         * \param[in] ir                Input record data: LINCS takes number of iterations and order of
+         *                              projection from it.
+         * \param[in] mtop              Topology of the system: SETTLE gets the masses for O and H atoms
+         *                              and target O-H and H-H distances from this object.
+         * \param[in] commandStream     GPU stream to use. Can be nullptr.
+         * \param[in] xUpdatedOnDevice  The event synchronizer to use to mark that update is done on the GPU.
          */
         Impl(const t_inputrec     &ir,
              const gmx_mtop_t     &mtop,
-             const void           *commandStream);
+             const void           *commandStream,
+             GpuEventSynchronizer *xUpdatedOnDevice);
 
         ~Impl();
 
@@ -142,7 +150,7 @@ class UpdateConstrainCuda::Impl
 
         /*! \brief Return the synchronizer associated with the event indicated that the coordinates are ready on the device.
          */
-        void *getCoordinatesReadySync();
+        GpuEventSynchronizer* getCoordinatesReadySync();
 
     private:
 
@@ -184,8 +192,8 @@ class UpdateConstrainCuda::Impl
         //! SETTLE CUDA object for water constrains
         std::unique_ptr<SettleCuda>          settleCuda_;
 
-        //! An event to indicate when the update of coordinates is complete
-        GpuEventSynchronizer                 coordinatesReady_;
+        //! An pointer to the event to indicate when the update of coordinates is complete
+        GpuEventSynchronizer                *coordinatesReady_;
 };
 
 } // namespace gmx
index 8b4fcb48e8d494ebabbff10728a26370bf19bd10..b0fccfa1a7c588968b0c48a8e0608471155ed130 100644 (file)
@@ -355,7 +355,7 @@ void gmx::LegacySimulator::do_md()
             GMX_LOG(mdlog.info).asParagraph().
                 appendText("Updating coordinates on the GPU.");
         }
-        integrator = std::make_unique<UpdateConstrainCuda>(*ir, *top_global, fr->stateGpu->getUpdateStream());
+        integrator = std::make_unique<UpdateConstrainCuda>(*ir, *top_global, fr->stateGpu->getUpdateStream(), fr->stateGpu->xUpdatedOnDevice());
     }
 
     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
index 5c20b44793ea21e01ed2ce770131a556ba74507e..13d00ab245ba4ba7a59820250a8400d798c107ae 100644 (file)
@@ -53,6 +53,7 @@
 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/math/vectypes.h"
+#include "gromacs/mdtypes/simulation_workload.h"
 #include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/classhelpers.h"
 
@@ -173,13 +174,31 @@ class StatePropagatorDataGpu
         void copyCoordinatesToGpu(gmx::ArrayRef<const gmx::RVec>  h_x,
                                   AtomLocality                    atomLocality);
 
-        /*! \brief Get the event synchronizer for the H2D coordinates copy.
+        /*! \brief Get the event synchronizer of the coordinates ready for the consumption on the device.
          *
-         *  \param[in] atomLocality  Locality of the particles to wait for.
+         * Returns the event synchronizer which indicates that the coordinates are ready for the
+         * consumption on the device. Takes into account that the producer may be different.
+         *
+         * If the update is offloaded, and the current step is not a DD/search step, the returned
+         * synchronizer indicates the completion of GPU update-constraint kernels. Otherwise, on search
+         * steps and if update is not offloaded, the coordinates are provided by the H2D copy and the
+         * returned synchronizer indicates that the copy is complete.
+         *
+         *  \param[in] atomLocality    Locality of the particles to wait for.
+         *  \param[in] simulationWork  The simulation lifetime flags.
+         *  \param[in] stepWork        The step lifetime flags.
          *
          *  \returns  The event to synchronize the stream that consumes coordinates on device.
          */
-        GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality  atomLocality);
+        GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality              atomLocality,
+                                                               const SimulationWorkload &simulationWork,
+                                                               const StepWorkload       &stepWork);
+
+        /*! \brief Getter for the event synchronizer for the update is done on th GPU
+         *
+         *  \returns  The event to synchronize the stream coordinates wre updated on device.
+         */
+        GpuEventSynchronizer* xUpdatedOnDevice();
 
         /*! \brief Copy positions from the GPU memory.
          *
index 6160bdf5a0a49f6a8cf26b32343faf2533a658a2..1f6da3a5d7b0ad67f45130923bd05fbe81f228ff 100644 (file)
@@ -88,7 +88,15 @@ DeviceBuffer<float> StatePropagatorDataGpu::getCoordinates()
     return DeviceBuffer<float> {};
 }
 
-GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality  /* atomLocality */)
+GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality              /* atomLocality   */,
+                                                                               const SimulationWorkload & /* simulationWork */,
+                                                                               const StepWorkload       & /* stepWork       */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
+    return nullptr;
+}
+
+GpuEventSynchronizer* StatePropagatorDataGpu::xUpdatedOnDevice()
 {
     GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
     return nullptr;
index cb15236738081934eb4571f8ded2e412b1da8e9e..b5fcffeb19ffcc429a99a5057fce781c14786747 100644 (file)
@@ -160,13 +160,31 @@ class StatePropagatorDataGpu::Impl
         void copyCoordinatesToGpu(gmx::ArrayRef<const gmx::RVec>  h_x,
                                   AtomLocality                    atomLocality);
 
-        /*! \brief Get the event synchronizer on the H2D coordinates copy.
+        /*! \brief Get the event synchronizer of the coordinates ready for the consumption on the device.
          *
-         *  \param[in] atomLocality  Locality of the particles to wait for.
+         * Returns the event synchronizer which indicates that the coordinates are ready for the
+         * consumption on the device. Takes into account that the producer may be different.
+         *
+         * If the update is offloaded, and the current step is not a DD/search step, the returned
+         * synchronizer indicates the completion of GPU update-constraint kernels. Otherwise, on search
+         * steps and if update is not offloaded, the coordinates are provided by the H2D copy and the
+         * returned synchronizer indicates that the copy is complete.
+         *
+         *  \param[in] atomLocality    Locality of the particles to wait for.
+         *  \param[in] simulationWork  The simulation lifetime flags.
+         *  \param[in] stepWork        The step lifetime flags.
          *
          *  \returns  The event to synchronize the stream that consumes coordinates on device.
          */
-        GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality  atomLocality);
+        GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality              atomLocality,
+                                                               const SimulationWorkload &simulationWork,
+                                                               const StepWorkload       &stepWork);
+
+        /*! \brief Getter for the event synchronizer for the update is done on th GPU
+         *
+         *  \returns  The event to synchronize the stream coordinates wre updated on device.
+         */
+        GpuEventSynchronizer* xUpdatedOnDevice();
 
         /*! \brief Copy positions from the GPU memory.
          *
@@ -294,8 +312,13 @@ class StatePropagatorDataGpu::Impl
         // Streams to use for forces H2D and D2H copies (one event for each atom locality)
         EnumerationArray<AtomLocality, CommandStream> fCopyStreams_ = {{nullptr}};
 
-        //! An array of events that indicate H2D copy is complete (one event for each atom locality)
+        /*! \brief An array of events that indicate H2D copy is complete (one event for each atom locality)
+         *
+         * \todo Reconsider naming. It should be xCopiedToDevice or xH2DCopyComplete, etc.
+         */
         EnumerationArray<AtomLocality, GpuEventSynchronizer> xReadyOnDevice_;
+        //! An event that the coordinates are ready after update-constraints execution
+        GpuEventSynchronizer                                 xUpdatedOnDevice_;
         //! An array of events that indicate D2H copy of coordinates is complete (one event for each atom locality)
         EnumerationArray<AtomLocality, GpuEventSynchronizer> xReadyOnHost_;
 
index 7070891d6c706b42352ae9ba0d2d8e8447b908a5..88bb6e5fedd84b99458c36f45342e6562b90bd6c 100644 (file)
@@ -269,9 +269,30 @@ void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef<cons
     }
 }
 
-GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceEvent(AtomLocality  atomLocality)
+GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceEvent(AtomLocality              atomLocality,
+                                                                                     const SimulationWorkload &simulationWork,
+                                                                                     const StepWorkload       &stepWork)
+{
+    // The provider of the coordinates may be different for local atoms. If the update is offloaded
+    // and this is not a neighbor search step, then the consumer needs to wait for the update
+    // to complete. Otherwise, the coordinates are copied from the host and we need to wait for
+    // the copy event. Non-local coordinates are always provided by the H2D copy.
+    //
+    // TODO: This should be reconsidered to support the halo exchange.
+    //
+    if (atomLocality == AtomLocality::Local && simulationWork.useGpuUpdate && !stepWork.doNeighborSearch)
+    {
+        return &xUpdatedOnDevice_;
+    }
+    else
+    {
+        return &xReadyOnDevice_[atomLocality];
+    }
+}
+
+GpuEventSynchronizer* StatePropagatorDataGpu::Impl::xUpdatedOnDevice()
 {
-    return &xReadyOnDevice_[atomLocality];
+    return &xUpdatedOnDevice_;
 }
 
 void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec>  h_x,
@@ -440,9 +461,16 @@ void StatePropagatorDataGpu::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx:
     return impl_->copyCoordinatesToGpu(h_x, atomLocality);
 }
 
-GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality  atomLocality)
+GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality              atomLocality,
+                                                                               const SimulationWorkload &simulationWork,
+                                                                               const StepWorkload       &stepWork)
+{
+    return impl_->getCoordinatesReadyOnDeviceEvent(atomLocality, simulationWork, stepWork);
+}
+
+GpuEventSynchronizer* StatePropagatorDataGpu::xUpdatedOnDevice()
 {
-    return impl_->getCoordinatesReadyOnDeviceEvent(atomLocality);
+    return impl_->xUpdatedOnDevice();
 }
 
 void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<RVec>  h_x,