From 7073b54d9fbf8ddfe007309233a9b83943dee67c Mon Sep 17 00:00:00 2001 From: Alan Gray Date: Tue, 22 Oct 2019 07:23:11 -0700 Subject: [PATCH] Event-based Dependency for GPU Force Halo Exchange Introduces new event recorded when exchanged forces are ready on GPU, and passes this into force buffer ops using dependencyList. Removes previous mechanism of forcing local stream to wait on non-local stream. Addresses part of #3093 Refs #3194 Change-Id: I768898839e5c6a653894d5eb80354f0e423e06ed --- src/gromacs/domdec/gpuhaloexchange.h | 4 ++++ src/gromacs/domdec/gpuhaloexchange_impl.cpp | 8 ++++++++ src/gromacs/domdec/gpuhaloexchange_impl.cu | 10 ++++++++++ src/gromacs/domdec/gpuhaloexchange_impl.cuh | 7 +++++++ src/gromacs/mdlib/sim_util.cpp | 11 ++--------- src/gromacs/nbnxm/cuda/nbnxm_cuda.cu | 10 ---------- src/gromacs/nbnxm/nbnxm.cpp | 5 ----- src/gromacs/nbnxm/nbnxm.h | 3 --- src/gromacs/nbnxm/nbnxm_gpu.h | 6 ------ 9 files changed, 31 insertions(+), 33 deletions(-) diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h index f577124ada..b7e6ff54fa 100644 --- a/src/gromacs/domdec/gpuhaloexchange.h +++ b/src/gromacs/domdec/gpuhaloexchange.h @@ -110,6 +110,10 @@ public: */ void communicateHaloForces(bool accumulateForces); + /*! \brief Get the event synchronizer for the forces ready on device. + * \returns The event to synchronize the stream that consumes forces on device. + */ + GpuEventSynchronizer* getForcesReadyOnDeviceEvent(); private: class Impl; diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cpp b/src/gromacs/domdec/gpuhaloexchange_impl.cpp index 7ea94ac6a4..2511673218 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cpp +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cpp @@ -96,6 +96,14 @@ void GpuHaloExchange::communicateHaloForces(bool gmx_unused accumulateForces) "A CPU stub for GPU Halo Exchange was called insted of the correct implementation."); } +/*!\brief get forces ready on device event stub. */ +GpuEventSynchronizer* GpuHaloExchange::getForcesReadyOnDeviceEvent() +{ + GMX_ASSERT(false, + "A CPU stub for GPU Halo Exchange was called insted of the correct implementation."); + return nullptr; +} + } // namespace gmx #endif /* GMX_GPU != GMX_GPU_CUDA */ diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu index 09061493b3..b1e743db28 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cu +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu @@ -289,6 +289,7 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces) launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs); } + fReadyOnDevice_.markEvent(nonLocalStream_); } @@ -378,6 +379,11 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void* sendPtr, #endif } +GpuEventSynchronizer* GpuHaloExchange::Impl::getForcesReadyOnDeviceEvent() +{ + return &fReadyOnDevice_; +} + /*! \brief Create Domdec GPU object */ GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* localStream, void* nonLocalStream) : dd_(dd), @@ -443,4 +449,8 @@ void GpuHaloExchange::communicateHaloForces(bool accumulateForces) impl_->communicateHaloForces(accumulateForces); } +GpuEventSynchronizer* GpuHaloExchange::getForcesReadyOnDeviceEvent() +{ + return impl_->getForcesReadyOnDeviceEvent(); +} } // namespace gmx diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cuh b/src/gromacs/domdec/gpuhaloexchange_impl.cuh index 9f4bf8cdd8..017cb19186 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cuh +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cuh @@ -96,6 +96,11 @@ public: */ void communicateHaloForces(bool accumulateForces); + /*! \brief Get the event synchronizer for the forces ready on device. + * \returns The event to synchronize the stream that consumes forces on device. + */ + GpuEventSynchronizer* getForcesReadyOnDeviceEvent(); + private: /*! \brief Data transfer wrapper for GPU halo exchange * \param [inout] d_ptr pointer to coordinates or force buffer in GPU memory @@ -177,6 +182,8 @@ private: float3* d_x_ = nullptr; //! full forces buffer in GPU memory float3* d_f_ = nullptr; + //! An event recorded once the exchanged forces are ready on the GPU + GpuEventSynchronizer fReadyOnDevice_; }; } // namespace gmx diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 6009b1a5a6..b52ec1b416 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -1670,7 +1670,7 @@ void do_force(FILE* fplog, fr->pmePpCommGpu->getForcesReadySynchronizer())) // buffer received from other GPU : nullptr; // PME reduction not active on GPU - gmx::FixedCapacityVector dependencyList; + gmx::FixedCapacityVector dependencyList; if (stepWork.useGpuPmeFReduction) { @@ -1703,14 +1703,7 @@ void do_force(FILE* fplog, } if (useGpuForcesHaloExchange) { - // Add a stream synchronization to satisfy a dependency - // for the local buffer ops on the result of GPU halo - // exchange, which operates in the non-local stream and - // writes to to local parf og the force buffer. - // - // TODO improve this through use of an event - see Redmine #3093 - // push the event into the dependencyList - nbv->stream_local_wait_for_nonlocal(); + dependencyList.push_back(gpuHaloExchange->getForcesReadyOnDeviceEvent()); } nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::Local, stateGpu->getForces(), pmeForcePtr, dependencyList, stepWork.useGpuPmeFReduction, diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index 4f2c54132f..45963daba6 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -945,14 +945,4 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLo } } -void nbnxn_stream_local_wait_for_nonlocal(gmx_nbnxn_cuda_t* nb) -{ - cudaStream_t localStream = nb->stream[InteractionLocality::Local]; - cudaStream_t nonLocalStream = nb->stream[InteractionLocality::NonLocal]; - - GpuEventSynchronizer event; - event.markEvent(nonLocalStream); - event.enqueueWaitEvent(localStream); -} - } // namespace Nbnxm diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp index 089c7f277f..1531f02816 100644 --- a/src/gromacs/nbnxm/nbnxm.cpp +++ b/src/gromacs/nbnxm/nbnxm.cpp @@ -245,9 +245,4 @@ void nonbonded_verlet_t::insertNonlocalGpuDependency(const gmx::InteractionLocal Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality); } -void nonbonded_verlet_t::stream_local_wait_for_nonlocal() -{ - Nbnxm::nbnxn_stream_local_wait_for_nonlocal(gpu_nbv); -} - /*! \endcond */ diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index fd11ec9de5..1548f3704f 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -353,9 +353,6 @@ public: /*! \brief return GPU pointer to f in rvec format */ void* get_gpu_frvec(); - /*! \brief Ensure local stream waits for non-local stream */ - void stream_local_wait_for_nonlocal(); - //! Return the kernel setup const Nbnxm::KernelSetup& kernelSetup() const { return kernelSetup_; } diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h index 423ec5b925..7a4a94b6a5 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu.h +++ b/src/gromacs/nbnxm/nbnxm_gpu.h @@ -328,11 +328,5 @@ void nbnxn_gpu_add_nbat_f_to_f(gmx::AtomLocality gmx_unused atomLocality, CUDA_FUNC_QUALIFIER void nbnxn_wait_x_on_device(gmx_nbnxn_gpu_t gmx_unused* nb) CUDA_FUNC_TERM; -/*! \brief Ensure local stream waits for non-local stream - * \param[in] nb The nonbonded data GPU structure - */ -CUDA_FUNC_QUALIFIER -void nbnxn_stream_local_wait_for_nonlocal(gmx_nbnxn_gpu_t gmx_unused* nb) CUDA_FUNC_TERM; - } // namespace Nbnxm #endif -- 2.22.0