From: Artem Zhmurov Date: Wed, 9 Oct 2019 14:02:36 +0000 (+0200) Subject: Centralize management of coordinates ready on GPU event X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=679c2f07f9732370999fbc636e50ef97be8550d4;p=alexxy%2Fgromacs.git Centralize management of coordinates ready on GPU event This change shifts the ownership of the EventSynchronizer indicating that coordinates are available on the device from UpdateConstraints to StatePropagatorDataGpu. The latter now provides a getter that, based on workload flags, decides which synchronizer to return. If update is offloaded the returned event records the completion of the update-constraints on the GPU. When update is not offloaded and on search steps the returned event records the completion of the host to device coordinate transfer. Change-Id: Idec9f46e78a2708fa31e895bf7590cdad1872987 --- diff --git a/src/gromacs/mdlib/update_constrain_cuda.h b/src/gromacs/mdlib/update_constrain_cuda.h index 9ab79720f6..3c4fc103f0 100644 --- a/src/gromacs/mdlib/update_constrain_cuda.h +++ b/src/gromacs/mdlib/update_constrain_cuda.h @@ -51,6 +51,8 @@ #include "gromacs/utility/arrayref.h" #include "gromacs/utility/classhelpers.h" +class GpuEventSynchronizer; + struct gmx_mtop_t; struct t_idef; struct t_inputrec; @@ -66,15 +68,23 @@ class UpdateConstrainCuda public: /*! \brief Create Update-Constrain object. * - * \param[in] ir Input record data: LINCS takes number of iterations and order of - * projection from it. - * \param[in] mtop Topology of the system: SETTLE gets the masses for O and H atoms - * and target O-H and H-H distances from this object. - * \param[in] commandStream GPU stream to use. Can be nullptr. + * The constructor is given a non-nullptr \p commandStream, in which all the update and constrain + * routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that modify + * coordinates. The event is maintained outside this class and also passed to all (if any) consumers + * of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr because the + * markEvent(...) method is called unconditionally. + * + * \param[in] ir Input record data: LINCS takes number of iterations and order of + * projection from it. + * \param[in] mtop Topology of the system: SETTLE gets the masses for O and H atoms + * and target O-H and H-H distances from this object. + * \param[in] commandStream GPU stream to use. Can be nullptr. + * \param[in] xUpdatedOnDevice The event synchronizer to use to mark that update is done on the GPU. */ - UpdateConstrainCuda(const t_inputrec &ir, - const gmx_mtop_t &mtop, - const void *commandStream); + UpdateConstrainCuda(const t_inputrec &ir, + const gmx_mtop_t &mtop, + const void *commandStream, + GpuEventSynchronizer *xUpdatedOnDevice); ~UpdateConstrainCuda(); @@ -137,7 +147,7 @@ class UpdateConstrainCuda /*! \brief Return the synchronizer associated with the event indicated that the coordinates are ready on the device. */ - void *getCoordinatesReadySync(); + GpuEventSynchronizer* getCoordinatesReadySync(); private: class Impl; diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.cpp b/src/gromacs/mdlib/update_constrain_cuda_impl.cpp index 9e9f56e80f..647493ec16 100644 --- a/src/gromacs/mdlib/update_constrain_cuda_impl.cpp +++ b/src/gromacs/mdlib/update_constrain_cuda_impl.cpp @@ -57,7 +57,8 @@ class UpdateConstrainCuda::Impl UpdateConstrainCuda::UpdateConstrainCuda(gmx_unused const t_inputrec &ir, gmx_unused const gmx_mtop_t &mtop, - gmx_unused const void *commandStream) + gmx_unused const void *commandStream, + gmx_unused GpuEventSynchronizer *xUpdatedOnDevice) : impl_(nullptr) { GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation."); @@ -98,7 +99,7 @@ void UpdateConstrainCuda::waitCoordinatesReadyOnDevice() GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation."); } -void* UpdateConstrainCuda::getCoordinatesReadySync() +GpuEventSynchronizer* UpdateConstrainCuda::getCoordinatesReadySync() { GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation."); return nullptr; diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.cu b/src/gromacs/mdlib/update_constrain_cuda_impl.cu index a58657b0e8..b146c83ca8 100644 --- a/src/gromacs/mdlib/update_constrain_cuda_impl.cu +++ b/src/gromacs/mdlib/update_constrain_cuda_impl.cu @@ -107,17 +107,21 @@ void UpdateConstrainCuda::Impl::integrate(const real dt, } } - coordinatesReady_.markEvent(commandStream_); + coordinatesReady_->markEvent(commandStream_); return; } -UpdateConstrainCuda::Impl::Impl(const t_inputrec &ir, - const gmx_mtop_t &mtop, - const void *commandStream) +UpdateConstrainCuda::Impl::Impl(const t_inputrec &ir, + const gmx_mtop_t &mtop, + const void *commandStream, + GpuEventSynchronizer *xUpdatedOnDevice) : + coordinatesReady_(xUpdatedOnDevice) { + GMX_ASSERT(xUpdatedOnDevice != nullptr, "The event synchronizer can not be nullptr."); commandStream != nullptr ? commandStream_ = *static_cast(commandStream) : commandStream_ = nullptr; + integrator_ = std::make_unique(commandStream_); lincsCuda_ = std::make_unique(ir.nLincsIter, ir.nProjOrder, commandStream_); settleCuda_ = std::make_unique(mtop, commandStream_); @@ -166,18 +170,19 @@ void UpdateConstrainCuda::Impl::setPbc(const t_pbc *pbc) void UpdateConstrainCuda::Impl::waitCoordinatesReadyOnDevice() { - coordinatesReady_.waitForEvent(); + coordinatesReady_->waitForEvent(); } -void *UpdateConstrainCuda::Impl::getCoordinatesReadySync() +GpuEventSynchronizer* UpdateConstrainCuda::Impl::getCoordinatesReadySync() { - return static_cast (&coordinatesReady_); + return coordinatesReady_; } -UpdateConstrainCuda::UpdateConstrainCuda(const t_inputrec &ir, - const gmx_mtop_t &mtop, - const void *commandStream) - : impl_(new Impl(ir, mtop, commandStream)) +UpdateConstrainCuda::UpdateConstrainCuda(const t_inputrec &ir, + const gmx_mtop_t &mtop, + const void *commandStream, + GpuEventSynchronizer *xUpdatedOnDevice) + : impl_(new Impl(ir, mtop, commandStream, xUpdatedOnDevice)) { } @@ -218,7 +223,7 @@ void UpdateConstrainCuda::waitCoordinatesReadyOnDevice() impl_->waitCoordinatesReadyOnDevice(); } -void* UpdateConstrainCuda::getCoordinatesReadySync() +GpuEventSynchronizer* UpdateConstrainCuda::getCoordinatesReadySync() { return impl_->getCoordinatesReadySync(); } diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.h b/src/gromacs/mdlib/update_constrain_cuda_impl.h index 31a05961e8..326578d12f 100644 --- a/src/gromacs/mdlib/update_constrain_cuda_impl.h +++ b/src/gromacs/mdlib/update_constrain_cuda_impl.h @@ -63,17 +63,25 @@ class UpdateConstrainCuda::Impl { public: - /*! \brief Create Update-Constrain object + /*! \brief Create Update-Constrain object. * - * \param[in] ir Input record data: LINCS takes number of iterations and order of - * projection from it. - * \param[in] mtop Topology of the system: SETTLE gets the masses for O and H atoms - * and target O-H and H-H distances from this object. - * \param[in] commandStream GPU stream to use. Can be nullptr. + * The constructor is given a non-nullptr \p commandStream, in which all the update and constrain + * routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that modify + * coordinates. The event is maintained outside this class and also passed to all (if any) consumers + * of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr because the + * markEvent(...) method is called unconditionally. + * + * \param[in] ir Input record data: LINCS takes number of iterations and order of + * projection from it. + * \param[in] mtop Topology of the system: SETTLE gets the masses for O and H atoms + * and target O-H and H-H distances from this object. + * \param[in] commandStream GPU stream to use. Can be nullptr. + * \param[in] xUpdatedOnDevice The event synchronizer to use to mark that update is done on the GPU. */ Impl(const t_inputrec &ir, const gmx_mtop_t &mtop, - const void *commandStream); + const void *commandStream, + GpuEventSynchronizer *xUpdatedOnDevice); ~Impl(); @@ -142,7 +150,7 @@ class UpdateConstrainCuda::Impl /*! \brief Return the synchronizer associated with the event indicated that the coordinates are ready on the device. */ - void *getCoordinatesReadySync(); + GpuEventSynchronizer* getCoordinatesReadySync(); private: @@ -184,8 +192,8 @@ class UpdateConstrainCuda::Impl //! SETTLE CUDA object for water constrains std::unique_ptr settleCuda_; - //! An event to indicate when the update of coordinates is complete - GpuEventSynchronizer coordinatesReady_; + //! An pointer to the event to indicate when the update of coordinates is complete + GpuEventSynchronizer *coordinatesReady_; }; } // namespace gmx diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp index 8b4fcb48e8..b0fccfa1a7 100644 --- a/src/gromacs/mdrun/md.cpp +++ b/src/gromacs/mdrun/md.cpp @@ -355,7 +355,7 @@ void gmx::LegacySimulator::do_md() GMX_LOG(mdlog.info).asParagraph(). appendText("Updating coordinates on the GPU."); } - integrator = std::make_unique(*ir, *top_global, fr->stateGpu->getUpdateStream()); + integrator = std::make_unique(*ir, *top_global, fr->stateGpu->getUpdateStream(), fr->stateGpu->xUpdatedOnDevice()); } if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate) diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h index 5c20b44793..13d00ab245 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h @@ -53,6 +53,7 @@ #include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/math/vectypes.h" +#include "gromacs/mdtypes/simulation_workload.h" #include "gromacs/utility/arrayref.h" #include "gromacs/utility/classhelpers.h" @@ -173,13 +174,31 @@ class StatePropagatorDataGpu void copyCoordinatesToGpu(gmx::ArrayRef h_x, AtomLocality atomLocality); - /*! \brief Get the event synchronizer for the H2D coordinates copy. + /*! \brief Get the event synchronizer of the coordinates ready for the consumption on the device. * - * \param[in] atomLocality Locality of the particles to wait for. + * Returns the event synchronizer which indicates that the coordinates are ready for the + * consumption on the device. Takes into account that the producer may be different. + * + * If the update is offloaded, and the current step is not a DD/search step, the returned + * synchronizer indicates the completion of GPU update-constraint kernels. Otherwise, on search + * steps and if update is not offloaded, the coordinates are provided by the H2D copy and the + * returned synchronizer indicates that the copy is complete. + * + * \param[in] atomLocality Locality of the particles to wait for. + * \param[in] simulationWork The simulation lifetime flags. + * \param[in] stepWork The step lifetime flags. * * \returns The event to synchronize the stream that consumes coordinates on device. */ - GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality); + GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality, + const SimulationWorkload &simulationWork, + const StepWorkload &stepWork); + + /*! \brief Getter for the event synchronizer for the update is done on th GPU + * + * \returns The event to synchronize the stream coordinates wre updated on device. + */ + GpuEventSynchronizer* xUpdatedOnDevice(); /*! \brief Copy positions from the GPU memory. * diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp index 6160bdf5a0..1f6da3a5d7 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp @@ -88,7 +88,15 @@ DeviceBuffer StatePropagatorDataGpu::getCoordinates() return DeviceBuffer {}; } -GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality /* atomLocality */) +GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality /* atomLocality */, + const SimulationWorkload & /* simulationWork */, + const StepWorkload & /* stepWork */) +{ + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation."); + return nullptr; +} + +GpuEventSynchronizer* StatePropagatorDataGpu::xUpdatedOnDevice() { GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation."); return nullptr; diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h index cb15236738..b5fcffeb19 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h @@ -160,13 +160,31 @@ class StatePropagatorDataGpu::Impl void copyCoordinatesToGpu(gmx::ArrayRef h_x, AtomLocality atomLocality); - /*! \brief Get the event synchronizer on the H2D coordinates copy. + /*! \brief Get the event synchronizer of the coordinates ready for the consumption on the device. * - * \param[in] atomLocality Locality of the particles to wait for. + * Returns the event synchronizer which indicates that the coordinates are ready for the + * consumption on the device. Takes into account that the producer may be different. + * + * If the update is offloaded, and the current step is not a DD/search step, the returned + * synchronizer indicates the completion of GPU update-constraint kernels. Otherwise, on search + * steps and if update is not offloaded, the coordinates are provided by the H2D copy and the + * returned synchronizer indicates that the copy is complete. + * + * \param[in] atomLocality Locality of the particles to wait for. + * \param[in] simulationWork The simulation lifetime flags. + * \param[in] stepWork The step lifetime flags. * * \returns The event to synchronize the stream that consumes coordinates on device. */ - GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality); + GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality, + const SimulationWorkload &simulationWork, + const StepWorkload &stepWork); + + /*! \brief Getter for the event synchronizer for the update is done on th GPU + * + * \returns The event to synchronize the stream coordinates wre updated on device. + */ + GpuEventSynchronizer* xUpdatedOnDevice(); /*! \brief Copy positions from the GPU memory. * @@ -294,8 +312,13 @@ class StatePropagatorDataGpu::Impl // Streams to use for forces H2D and D2H copies (one event for each atom locality) EnumerationArray fCopyStreams_ = {{nullptr}}; - //! An array of events that indicate H2D copy is complete (one event for each atom locality) + /*! \brief An array of events that indicate H2D copy is complete (one event for each atom locality) + * + * \todo Reconsider naming. It should be xCopiedToDevice or xH2DCopyComplete, etc. + */ EnumerationArray xReadyOnDevice_; + //! An event that the coordinates are ready after update-constraints execution + GpuEventSynchronizer xUpdatedOnDevice_; //! An array of events that indicate D2H copy of coordinates is complete (one event for each atom locality) EnumerationArray xReadyOnHost_; diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp index 7070891d6c..88bb6e5fed 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp @@ -269,9 +269,30 @@ void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef h_x, @@ -440,9 +461,16 @@ void StatePropagatorDataGpu::copyCoordinatesToGpu(const gmx::ArrayRefcopyCoordinatesToGpu(h_x, atomLocality); } -GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality) +GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality, + const SimulationWorkload &simulationWork, + const StepWorkload &stepWork) +{ + return impl_->getCoordinatesReadyOnDeviceEvent(atomLocality, simulationWork, stepWork); +} + +GpuEventSynchronizer* StatePropagatorDataGpu::xUpdatedOnDevice() { - return impl_->getCoordinatesReadyOnDeviceEvent(atomLocality); + return impl_->xUpdatedOnDevice(); } void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef h_x,