Centralize management of coordinates ready on GPU event

author Artem Zhmurov <zhmurov@gmail.com>

Wed, 9 Oct 2019 14:02:36 +0000 (16:02 +0200)

committer Mark Abraham <mark.j.abraham@gmail.com>

Sat, 12 Oct 2019 11:18:07 +0000 (13:18 +0200)
author Artem Zhmurov <zhmurov@gmail.com>
Wed, 9 Oct 2019 14:02:36 +0000 (16:02 +0200)
committer Mark Abraham <mark.j.abraham@gmail.com>
Sat, 12 Oct 2019 11:18:07 +0000 (13:18 +0200)
diff --git a/src/gromacs/mdlib/update_constrain_cuda.h b/src/gromacs/mdlib/update_constrain_cuda.h

index 9ab79720f6b7e65ccf32d0879a2815e664bb0831..3c4fc103f00746f5afa12a5b8bf1b8b1eade7572 100644 (file)
--- a/src/gromacs/mdlib/update_constrain_cuda.h
+++ b/src/gromacs/mdlib/update_constrain_cuda.h
@@ -51,6 +51,8 @@
  #include "gromacs/utility/arrayref.h"
  #include "gromacs/utility/classhelpers.h"
  
+class GpuEventSynchronizer;
+
  struct gmx_mtop_t;
  struct t_idef;
  struct t_inputrec;
@@ -66,15 +68,23 @@ class UpdateConstrainCuda
      public:
          /*! \brief Create Update-Constrain object.
           *
-         * \param[in] ir             Input record data: LINCS takes number of iterations and order of
-         *                           projection from it.
-         * \param[in] mtop           Topology of the system: SETTLE gets the masses for O and H atoms
-         *                           and target O-H and H-H distances from this object.
-         * \param[in] commandStream  GPU stream to use. Can be nullptr.
+         * The constructor is given a non-nullptr \p commandStream, in which all the update and constrain
+         * routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that modify
+         * coordinates. The event is maintained outside this class and also passed to all (if any) consumers
+         * of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr because the
+         * markEvent(...) method is called unconditionally.
+         *
+         * \param[in] ir                Input record data: LINCS takes number of iterations and order of
+         *                              projection from it.
+         * \param[in] mtop              Topology of the system: SETTLE gets the masses for O and H atoms
+         *                              and target O-H and H-H distances from this object.
+         * \param[in] commandStream     GPU stream to use. Can be nullptr.
+         * \param[in] xUpdatedOnDevice  The event synchronizer to use to mark that update is done on the GPU.
           */
-        UpdateConstrainCuda(const t_inputrec  &ir,
-                            const gmx_mtop_t  &mtop,
-                            const void        *commandStream);
+        UpdateConstrainCuda(const t_inputrec     &ir,
+                            const gmx_mtop_t     &mtop,
+                            const void           *commandStream,
+                            GpuEventSynchronizer *xUpdatedOnDevice);
  
          ~UpdateConstrainCuda();
  
@@ -137,7 +147,7 @@ class UpdateConstrainCuda
  
          /*! \brief Return the synchronizer associated with the event indicated that the coordinates are ready on the device.
           */
-        void *getCoordinatesReadySync();
+        GpuEventSynchronizer* getCoordinatesReadySync();
  
      private:
          class Impl;
diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.cpp b/src/gromacs/mdlib/update_constrain_cuda_impl.cpp

index 9e9f56e80fcaaea2ad8102fa4756285d8a29e854..647493ec163348ad6041cc4ef1deba8612dae769 100644 (file)
--- a/src/gromacs/mdlib/update_constrain_cuda_impl.cpp
+++ b/src/gromacs/mdlib/update_constrain_cuda_impl.cpp
@@ -57,7 +57,8 @@ class UpdateConstrainCuda::Impl
  
  UpdateConstrainCuda::UpdateConstrainCuda(gmx_unused const t_inputrec     &ir,
                                           gmx_unused const gmx_mtop_t     &mtop,
-                                         gmx_unused const void           *commandStream)
+                                         gmx_unused const void           *commandStream,
+                                         gmx_unused GpuEventSynchronizer *xUpdatedOnDevice)
      : impl_(nullptr)
  {
      GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
@@ -98,7 +99,7 @@ void UpdateConstrainCuda::waitCoordinatesReadyOnDevice()
      GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
  }
  
-void* UpdateConstrainCuda::getCoordinatesReadySync()
+GpuEventSynchronizer* UpdateConstrainCuda::getCoordinatesReadySync()
  {
      GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
      return nullptr;
diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.cu b/src/gromacs/mdlib/update_constrain_cuda_impl.cu

index a58657b0e85dad99346126db743e9dff226e86d5..b146c83ca8b4b850d24b389edce6f7c5b535ead8 100644 (file)
--- a/src/gromacs/mdlib/update_constrain_cuda_impl.cu
+++ b/src/gromacs/mdlib/update_constrain_cuda_impl.cu
@@ -107,17 +107,21 @@ void UpdateConstrainCuda::Impl::integrate(const real                        dt,
          }
      }
  
-    coordinatesReady_.markEvent(commandStream_);
+    coordinatesReady_->markEvent(commandStream_);
  
      return;
  }
  
-UpdateConstrainCuda::Impl::Impl(const t_inputrec  &ir,
-                                const gmx_mtop_t  &mtop,
-                                const void        *commandStream)
+UpdateConstrainCuda::Impl::Impl(const t_inputrec     &ir,
+                                const gmx_mtop_t     &mtop,
+                                const void           *commandStream,
+                                GpuEventSynchronizer *xUpdatedOnDevice) :
+    coordinatesReady_(xUpdatedOnDevice)
  {
+    GMX_ASSERT(xUpdatedOnDevice != nullptr, "The event synchronizer can not be nullptr.");
      commandStream != nullptr ? commandStream_ = *static_cast<const CommandStream*>(commandStream) : commandStream_ = nullptr;
  
+
      integrator_ = std::make_unique<LeapFrogCuda>(commandStream_);
      lincsCuda_  = std::make_unique<LincsCuda>(ir.nLincsIter, ir.nProjOrder, commandStream_);
      settleCuda_ = std::make_unique<SettleCuda>(mtop, commandStream_);
@@ -166,18 +170,19 @@ void UpdateConstrainCuda::Impl::setPbc(const t_pbc *pbc)
  
  void UpdateConstrainCuda::Impl::waitCoordinatesReadyOnDevice()
  {
-    coordinatesReady_.waitForEvent();
+    coordinatesReady_->waitForEvent();
  }
  
-void *UpdateConstrainCuda::Impl::getCoordinatesReadySync()
+GpuEventSynchronizer* UpdateConstrainCuda::Impl::getCoordinatesReadySync()
  {
-    return static_cast<void*> (&coordinatesReady_);
+    return coordinatesReady_;
  }
  
-UpdateConstrainCuda::UpdateConstrainCuda(const t_inputrec  &ir,
-                                         const gmx_mtop_t  &mtop,
-                                         const void        *commandStream)
-    : impl_(new Impl(ir, mtop, commandStream))
+UpdateConstrainCuda::UpdateConstrainCuda(const t_inputrec     &ir,
+                                         const gmx_mtop_t     &mtop,
+                                         const void           *commandStream,
+                                         GpuEventSynchronizer *xUpdatedOnDevice)
+    : impl_(new Impl(ir, mtop, commandStream, xUpdatedOnDevice))
  {
  }
  
@@ -218,7 +223,7 @@ void UpdateConstrainCuda::waitCoordinatesReadyOnDevice()
      impl_->waitCoordinatesReadyOnDevice();
  }
  
-void* UpdateConstrainCuda::getCoordinatesReadySync()
+GpuEventSynchronizer* UpdateConstrainCuda::getCoordinatesReadySync()
  {
      return impl_->getCoordinatesReadySync();
  }
diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.h b/src/gromacs/mdlib/update_constrain_cuda_impl.h

index 31a05961e8af17fdac0d9dc4887ad31a4a7eaedc..326578d12fc7519b8942f34313b6e081f9c64a32 100644 (file)
--- a/src/gromacs/mdlib/update_constrain_cuda_impl.h
+++ b/src/gromacs/mdlib/update_constrain_cuda_impl.h
@@ -63,17 +63,25 @@ class UpdateConstrainCuda::Impl
  {
  
      public:
-        /*! \brief Create Update-Constrain object
+        /*! \brief Create Update-Constrain object.
           *
-         * \param[in] ir             Input record data: LINCS takes number of iterations and order of
-         *                           projection from it.
-         * \param[in] mtop           Topology of the system: SETTLE gets the masses for O and H atoms
-         *                           and target O-H and H-H distances from this object.
-         * \param[in] commandStream  GPU stream to use. Can be nullptr.
+         * The constructor is given a non-nullptr \p commandStream, in which all the update and constrain
+         * routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that modify
+         * coordinates. The event is maintained outside this class and also passed to all (if any) consumers
+         * of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr because the
+         * markEvent(...) method is called unconditionally.
+         *
+         * \param[in] ir                Input record data: LINCS takes number of iterations and order of
+         *                              projection from it.
+         * \param[in] mtop              Topology of the system: SETTLE gets the masses for O and H atoms
+         *                              and target O-H and H-H distances from this object.
+         * \param[in] commandStream     GPU stream to use. Can be nullptr.
+         * \param[in] xUpdatedOnDevice  The event synchronizer to use to mark that update is done on the GPU.
           */
          Impl(const t_inputrec     &ir,
               const gmx_mtop_t     &mtop,
-             const void           *commandStream);
+             const void           *commandStream,
+             GpuEventSynchronizer *xUpdatedOnDevice);
  
          ~Impl();
  
@@ -142,7 +150,7 @@ class UpdateConstrainCuda::Impl
  
          /*! \brief Return the synchronizer associated with the event indicated that the coordinates are ready on the device.
           */
-        void *getCoordinatesReadySync();
+        GpuEventSynchronizer* getCoordinatesReadySync();
  
      private:
  
@@ -184,8 +192,8 @@ class UpdateConstrainCuda::Impl
          //! SETTLE CUDA object for water constrains
          std::unique_ptr<SettleCuda>          settleCuda_;
  
-        //! An event to indicate when the update of coordinates is complete
-        GpuEventSynchronizer                 coordinatesReady_;
+        //! An pointer to the event to indicate when the update of coordinates is complete
+        GpuEventSynchronizer                *coordinatesReady_;
  };
  
  } // namespace gmx
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp

index 8b4fcb48e8d494ebabbff10728a26370bf19bd10..b0fccfa1a7c588968b0c48a8e0608471155ed130 100644 (file)
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -355,7 +355,7 @@ void gmx::LegacySimulator::do_md()
              GMX_LOG(mdlog.info).asParagraph().
                  appendText("Updating coordinates on the GPU.");
          }
-        integrator = std::make_unique<UpdateConstrainCuda>(*ir, *top_global, fr->stateGpu->getUpdateStream());
+        integrator = std::make_unique<UpdateConstrainCuda>(*ir, *top_global, fr->stateGpu->getUpdateStream(), fr->stateGpu->xUpdatedOnDevice());
      }
  
      if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h

index 5c20b44793ea21e01ed2ce770131a556ba74507e..13d00ab245ba4ba7a59820250a8400d798c107ae 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h
@@ -53,6 +53,7 @@
  #include "gromacs/gpu_utils/devicebuffer_datatype.h"
  #include "gromacs/gpu_utils/gpu_utils.h"
  #include "gromacs/math/vectypes.h"
+#include "gromacs/mdtypes/simulation_workload.h"
  #include "gromacs/utility/arrayref.h"
  #include "gromacs/utility/classhelpers.h"
  
@@ -173,13 +174,31 @@ class StatePropagatorDataGpu
          void copyCoordinatesToGpu(gmx::ArrayRef<const gmx::RVec>  h_x,
                                    AtomLocality                    atomLocality);
  
-        /*! \brief Get the event synchronizer for the H2D coordinates copy.
+        /*! \brief Get the event synchronizer of the coordinates ready for the consumption on the device.
           *
-         *  \param[in] atomLocality  Locality of the particles to wait for.
+         * Returns the event synchronizer which indicates that the coordinates are ready for the
+         * consumption on the device. Takes into account that the producer may be different.
+         *
+         * If the update is offloaded, and the current step is not a DD/search step, the returned
+         * synchronizer indicates the completion of GPU update-constraint kernels. Otherwise, on search
+         * steps and if update is not offloaded, the coordinates are provided by the H2D copy and the
+         * returned synchronizer indicates that the copy is complete.
+         *
+         *  \param[in] atomLocality    Locality of the particles to wait for.
+         *  \param[in] simulationWork  The simulation lifetime flags.
+         *  \param[in] stepWork        The step lifetime flags.
           *
           *  \returns  The event to synchronize the stream that consumes coordinates on device.
           */
-        GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality  atomLocality);
+        GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality              atomLocality,
+                                                               const SimulationWorkload &simulationWork,
+                                                               const StepWorkload       &stepWork);
+
+        /*! \brief Getter for the event synchronizer for the update is done on th GPU
+         *
+         *  \returns  The event to synchronize the stream coordinates wre updated on device.
+         */
+        GpuEventSynchronizer* xUpdatedOnDevice();
  
          /*! \brief Copy positions from the GPU memory.
           *
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp

index 6160bdf5a0a49f6a8cf26b32343faf2533a658a2..1f6da3a5d7b0ad67f45130923bd05fbe81f228ff 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
@@ -88,7 +88,15 @@ DeviceBuffer<float> StatePropagatorDataGpu::getCoordinates()
      return DeviceBuffer<float> {};
  }
  
-GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality  /* atomLocality */)
+GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality              /* atomLocality   */,
+                                                                               const SimulationWorkload & /* simulationWork */,
+                                                                               const StepWorkload       & /* stepWork       */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
+    return nullptr;
+}
+
+GpuEventSynchronizer* StatePropagatorDataGpu::xUpdatedOnDevice()
  {
      GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
      return nullptr;
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h

index cb15236738081934eb4571f8ded2e412b1da8e9e..b5fcffeb19ffcc429a99a5057fce781c14786747 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
@@ -160,13 +160,31 @@ class StatePropagatorDataGpu::Impl
          void copyCoordinatesToGpu(gmx::ArrayRef<const gmx::RVec>  h_x,
                                    AtomLocality                    atomLocality);
  
-        /*! \brief Get the event synchronizer on the H2D coordinates copy.
+        /*! \brief Get the event synchronizer of the coordinates ready for the consumption on the device.
           *
-         *  \param[in] atomLocality  Locality of the particles to wait for.
+         * Returns the event synchronizer which indicates that the coordinates are ready for the
+         * consumption on the device. Takes into account that the producer may be different.
+         *
+         * If the update is offloaded, and the current step is not a DD/search step, the returned
+         * synchronizer indicates the completion of GPU update-constraint kernels. Otherwise, on search
+         * steps and if update is not offloaded, the coordinates are provided by the H2D copy and the
+         * returned synchronizer indicates that the copy is complete.
+         *
+         *  \param[in] atomLocality    Locality of the particles to wait for.
+         *  \param[in] simulationWork  The simulation lifetime flags.
+         *  \param[in] stepWork        The step lifetime flags.
           *
           *  \returns  The event to synchronize the stream that consumes coordinates on device.
           */
-        GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality  atomLocality);
+        GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality              atomLocality,
+                                                               const SimulationWorkload &simulationWork,
+                                                               const StepWorkload       &stepWork);
+
+        /*! \brief Getter for the event synchronizer for the update is done on th GPU
+         *
+         *  \returns  The event to synchronize the stream coordinates wre updated on device.
+         */
+        GpuEventSynchronizer* xUpdatedOnDevice();
  
          /*! \brief Copy positions from the GPU memory.
           *
@@ -294,8 +312,13 @@ class StatePropagatorDataGpu::Impl
          // Streams to use for forces H2D and D2H copies (one event for each atom locality)
          EnumerationArray<AtomLocality, CommandStream> fCopyStreams_ = {{nullptr}};
  
-        //! An array of events that indicate H2D copy is complete (one event for each atom locality)
+        /*! \brief An array of events that indicate H2D copy is complete (one event for each atom locality)
+         *
+         * \todo Reconsider naming. It should be xCopiedToDevice or xH2DCopyComplete, etc.
+         */
          EnumerationArray<AtomLocality, GpuEventSynchronizer> xReadyOnDevice_;
+        //! An event that the coordinates are ready after update-constraints execution
+        GpuEventSynchronizer                                 xUpdatedOnDevice_;
          //! An array of events that indicate D2H copy of coordinates is complete (one event for each atom locality)
          EnumerationArray<AtomLocality, GpuEventSynchronizer> xReadyOnHost_;
  
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp

index 7070891d6c706b42352ae9ba0d2d8e8447b908a5..88bb6e5fedd84b99458c36f45342e6562b90bd6c 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
@@ -269,9 +269,30 @@ void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef<cons
      }
  }
  
-GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceEvent(AtomLocality  atomLocality)
+GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceEvent(AtomLocality              atomLocality,
+                                                                                     const SimulationWorkload &simulationWork,
+                                                                                     const StepWorkload       &stepWork)
+{
+    // The provider of the coordinates may be different for local atoms. If the update is offloaded
+    // and this is not a neighbor search step, then the consumer needs to wait for the update
+    // to complete. Otherwise, the coordinates are copied from the host and we need to wait for
+    // the copy event. Non-local coordinates are always provided by the H2D copy.
+    //
+    // TODO: This should be reconsidered to support the halo exchange.
+    //
+    if (atomLocality == AtomLocality::Local && simulationWork.useGpuUpdate && !stepWork.doNeighborSearch)
+    {
+        return &xUpdatedOnDevice_;
+    }
+    else
+    {
+        return &xReadyOnDevice_[atomLocality];
+    }
+}
+
+GpuEventSynchronizer* StatePropagatorDataGpu::Impl::xUpdatedOnDevice()
  {
-    return &xReadyOnDevice_[atomLocality];
+    return &xUpdatedOnDevice_;
  }
  
  void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec>  h_x,
@@ -440,9 +461,16 @@ void StatePropagatorDataGpu::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx:
      return impl_->copyCoordinatesToGpu(h_x, atomLocality);
  }
  
-GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality  atomLocality)
+GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality              atomLocality,
+                                                                               const SimulationWorkload &simulationWork,
+                                                                               const StepWorkload       &stepWork)
+{
+    return impl_->getCoordinatesReadyOnDeviceEvent(atomLocality, simulationWork, stepWork);
+}
+
+GpuEventSynchronizer* StatePropagatorDataGpu::xUpdatedOnDevice()
  {
-    return impl_->getCoordinatesReadyOnDeviceEvent(atomLocality);
+    return impl_->xUpdatedOnDevice();
  }
  
  void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<RVec>  h_x,
author	Artem Zhmurov <zhmurov@gmail.com>
	Wed, 9 Oct 2019 14:02:36 +0000 (16:02 +0200)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Sat, 12 Oct 2019 11:18:07 +0000 (13:18 +0200)
src/gromacs/mdlib/update_constrain_cuda.h		patch \| blob \| history
src/gromacs/mdlib/update_constrain_cuda_impl.cpp		patch \| blob \| history
src/gromacs/mdlib/update_constrain_cuda_impl.cu		patch \| blob \| history
src/gromacs/mdlib/update_constrain_cuda_impl.h		patch \| blob \| history
src/gromacs/mdrun/md.cpp		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu.h		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu_impl.h		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp		patch \| blob \| history