From: Szilárd Páll Date: Tue, 26 Oct 2021 21:52:34 +0000 (+0200) Subject: Unify coordinate copy handling across GPU platforms X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?p=alexxy%2Fgromacs.git;a=commitdiff_plain;h=0d86b360f5da797960a7d5dc3520690439bbbc1f Unify coordinate copy handling across GPU platforms Instead of handling the OpenCL host to device copy as a special case where this operation is issued in the PME stream, as the "Update" stream is anyway created unconditionally it makes sense to simplify codepaths at the negligible cost of an extra GPU-side synchronization. This allows unifying the coordinate handling in stateGpu as well as in PME spread (with combined PP+PME) which removes some complexity and makes it easier to reason about event consumption, reducing the need for balancing workarounds. The change also introduces a data member in StatePropagatorDataGpu to indicate whether it is use on a separate PME rank, case in which marking of the coordinate H2D event can be skipped as no event dependency is required. Refs #3988 --- diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp index 680fcec0fc..3f8e00f995 100644 --- a/src/gromacs/ewald/pme_gpu.cpp +++ b/src/gromacs/ewald/pme_gpu.cpp @@ -198,8 +198,7 @@ void pme_gpu_launch_spread(gmx_pme_t* pme, gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu) { GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); - GMX_ASSERT(!GMX_GPU_CUDA || xReadyOnDevice || !pme->bPPnode, - "Need a valid xReadyOnDevice on PP+PME ranks with CUDA."); + GMX_ASSERT(xReadyOnDevice || !pme->bPPnode, "Need a valid xReadyOnDevice on PP+PME ranks."); GMX_ASSERT(pme->doCoulomb, "Only Coulomb PME can be run on GPU."); PmeGpu* pmeGpu = pme->gpu; diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index 35fff48b6a..5e5bfd8ebe 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -1324,9 +1324,9 @@ void pme_gpu_spread(const PmeGpu* pmeGpu, "inconsistent atom data padding vs. spreading block size"); // Ensure that coordinates are ready on the device before launching spread; - // only needed with CUDA on PP+PME ranks, not on separate PME ranks, in unit tests - // nor in OpenCL as these cases use a single stream (hence xReadyOnDevice == nullptr). - GMX_ASSERT(!GMX_GPU_CUDA || xReadyOnDevice != nullptr || pmeGpu->common->isRankPmeOnly + // only needed on PP+PME ranks, not on separate PME ranks, in unit tests + // as these cases use a single stream (hence xReadyOnDevice == nullptr). + GMX_ASSERT(xReadyOnDevice != nullptr || pmeGpu->common->isRankPmeOnly || pme_gpu_settings(pmeGpu).copyAllOutputs, "Need a valid coordinate synchronizer on PP+PME ranks with CUDA."); diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h index d72bc5a0f2..ec73a14d7a 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h @@ -419,6 +419,9 @@ private: //! \brief Pointer to wallcycle structure. gmx_wallcycle* wcycle_; + //! Whether this instance of the class is used on a PME-only rank + bool isPmeOnly_ = false; + /*! \brief Performs the copy of data from host to device buffer. * * \todo Template on locality. diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp index 9bac56b216..4e3126efba 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp @@ -77,9 +77,7 @@ StatePropagatorDataGpu::Impl::Impl(const DeviceStreamManager& deviceStreamManage pmeStream_ = &deviceStreamManager.stream(DeviceStreamType::Pme); localStream_ = &deviceStreamManager.stream(DeviceStreamType::NonBondedLocal); nonLocalStream_ = &deviceStreamManager.stream(DeviceStreamType::NonBondedNonLocal); - // PME stream is used in OpenCL for H2D coordinate transfer - updateStream_ = &deviceStreamManager.stream( - GMX_GPU_OPENCL ? DeviceStreamType::Pme : DeviceStreamType::UpdateAndConstraints); + updateStream_ = &deviceStreamManager.stream(DeviceStreamType::UpdateAndConstraints); // Map the atom locality to the stream that will be used for coordinates, // velocities and forces transfers. Same streams are used for H2D and D2H copies. @@ -120,6 +118,7 @@ StatePropagatorDataGpu::Impl::Impl(const DeviceStream* pmeStream, nonLocalStream_ = nullptr; updateStream_ = nullptr; + isPmeOnly_ = true; // Only local/all coordinates are allowed to be copied in PME-only rank/ PME tests. // This it temporary measure to make it safe to use this class in those cases. @@ -330,11 +329,8 @@ void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef