From 7b77ac03f6364c62b746c24983611d9dbef23b98 Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Thu, 10 Oct 2019 18:03:11 +0200 Subject: [PATCH] Centralize management of forces ready on GPU event This change adds the GpuEventSynchronizer for the forces reduced on GPU event to the StatePropagatorDataGpu. If the buffer ops are offloaded, this event should be marked when the force reduction is done. The consumers of the forces on the GPU will get either this event or the event marking that the H2D copy is done, depending on the current step workload and offload scenario. Change-Id: Ib559dbed5ad777eac3a906e4ee0ebaa07caf0ac1 --- src/gromacs/mdlib/sim_util.cpp | 6 +++-- .../mdtypes/state_propagator_data_gpu.h | 21 ++++++++++++--- .../state_propagator_data_gpu_impl.cpp | 11 ++++++-- .../mdtypes/state_propagator_data_gpu_impl.h | 23 +++++++++++++--- .../state_propagator_data_gpu_impl_gpu.cpp | 26 ++++++++++++++++--- 5 files changed, 73 insertions(+), 14 deletions(-) diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 57ff728da0..8386e34139 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -1506,7 +1506,8 @@ void do_force(FILE *fplog, if (haveNonLocalForceContribInCpuBuffer) { stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal); - dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::NonLocal)); + dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::NonLocal, + useGpuFBufOps == BufferOpsUseGpu::True)); } nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::NonLocal, @@ -1660,7 +1661,8 @@ void do_force(FILE *fplog, if (haveLocalForceContribInCpuBuffer && !useGpuForcesHaloExchange) { stateGpu->copyForcesToGpu(forceWithShift, gmx::StatePropagatorDataGpu::AtomLocality::Local); - 
dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::Local)); + dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::Local, + useGpuFBufOps == BufferOpsUseGpu::True)); } if (useGpuForcesHaloExchange) { diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h index 13d00ab245..086c43fd4e 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h @@ -266,13 +266,28 @@ class StatePropagatorDataGpu void copyForcesToGpu(gmx::ArrayRef h_f, AtomLocality atomLocality); - /*! \brief Get the event synchronizer for the H2D forces copy. + /*! \brief Get the event synchronizer for the forces ready on device. * - * \param[in] atomLocality Locality of the particles to wait for. + * Returns either of the event synchronizers, depending on the offload scenario + * for the current simulation timestep: + * 1. The forces are copied to the device (when GPU buffer ops are off) + * 2. The forces are reduced on the device (GPU buffer ops are on) + * + * \todo Pass step workload instead of the useGpuFBufferOps boolean. + * + * \param[in] atomLocality Locality of the particles to wait for. + * \param[in] useGpuFBufferOps If the force buffer ops are offloaded to the GPU. * * \returns The event to synchronize the stream that consumes forces on device. */ - GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality atomLocality); + GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality atomLocality, + bool useGpuFBufferOps); + + /*! \brief Getter for the event synchronizer for the forces are reduced on the GPU. + * + * \returns The event to mark when forces are reduced on the GPU. + */ + GpuEventSynchronizer* fReducedOnDevice(); /*! \brief Copy forces from the GPU memory. 
* diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp index 1f6da3a5d7..68d16ccd3b 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp @@ -162,9 +162,16 @@ void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef h_f, AtomLocality atomLocality); - /*! \brief Get the event synchronizer on the H2D forces copy. + /*! \brief Get the event synchronizer for the forces ready on device. * - * \param[in] atomLocality Locality of the particles to wait for. + * Returns either of the event synchronizers, depending on the offload scenario + * for the current simulation timestep: + * 1. The forces are copied to the device (when GPU buffer ops are off) + * 2. The forces are reduced on the device (GPU buffer ops are on) + * + * \todo Pass step workload instead of the useGpuFBufferOps boolean. + * + * \param[in] atomLocality Locality of the particles to wait for. + * \param[in] useGpuFBufferOps If the force buffer ops are offloaded to the GPU. * * \returns The event to synchronize the stream that consumes forces on device. */ - GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality atomLocality); + GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality atomLocality, + bool useGpuFBufferOps); + + /*! \brief Getter for the event synchronizer for the forces are reduced on the GPU. + * + * \returns The event to mark when forces are reduced on the GPU. + */ + GpuEventSynchronizer* fReducedOnDevice(); /*! \brief Copy forces from the GPU memory. * @@ -329,6 +344,8 @@ class StatePropagatorDataGpu::Impl //! An array of events that indicate H2D copy of forces is complete (one event for each atom locality) EnumerationArray fReadyOnDevice_; + //! An event that the forces were reduced on the GPU + GpuEventSynchronizer fReducedOnDevice_; //! 
An array of events that indicate D2H copy of forces is complete (one event for each atom locality) EnumerationArray fReadyOnHost_; diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp index 88bb6e5fed..2f66ea4d3e 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp @@ -376,11 +376,23 @@ void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef h_f, AtomLocality atomLocality) @@ -524,9 +536,15 @@ void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRefcopyForcesToGpu(h_f, atomLocality); } -GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality atomLocality) +GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality atomLocality, + bool useGpuFBufferOps) +{ + return impl_->getForcesReadyOnDeviceEvent(atomLocality, useGpuFBufferOps); +} + +GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice() { - return impl_->getForcesReadyOnDeviceEvent(atomLocality); + return impl_->fReducedOnDevice(); } void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef h_f, -- 2.22.0