From 8a0d4d971981a7c9a36e71873284cf8d2cb05129 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= Date: Thu, 10 Oct 2019 22:20:40 +0200 Subject: [PATCH] Enable StatePropagatorGpuData for force transfers Force transfers have been switched to use StatePropagatorGpuData already before. This change updates the synchronization mechanisms as: - replaces the previous stream sync after GPU buffer/ops reduction with a waitForcesReadyOnHost call; - removes the barriers in copyForces[From|To]Gpu() as dependencies are now satisfied: most dependencies are intra-stream and therefore implicit, the exception being the halo exchange that uses its own mechanism to sync H2D in the local stream with the nonlocal stream (which is yet to be replaces Refs #3093). Refs. #3126. Change-Id: I8bfd39f79c87f20492c4ae287d6f19261724f806 --- src/gromacs/mdlib/sim_util.cpp | 5 ++--- .../mdtypes/state_propagator_data_gpu_impl_gpu.cpp | 4 ---- src/gromacs/nbnxm/cuda/nbnxm_cuda.cu | 13 ------------- src/gromacs/nbnxm/nbnxm.cpp | 5 ----- src/gromacs/nbnxm/nbnxm.h | 6 ------ src/gromacs/nbnxm/nbnxm_gpu.h | 5 ----- 6 files changed, 2 insertions(+), 36 deletions(-) diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 32c30aab8f..4c8323444d 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -1573,7 +1573,7 @@ void do_force(FILE *fplog, { if (useGpuFBufOps == BufferOpsUseGpu::True) { - nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::NonLocal); + stateGpu->waitForcesReadyOnHost(gmx::StatePropagatorDataGpu::AtomLocality::NonLocal); } dd_move_f(cr->dd, &forceOut.forceWithShiftForces(), wcycle); } @@ -1691,9 +1691,8 @@ void do_force(FILE *fplog, pme_gpu_get_device_f(fr->pmedata), dependencyList, stepWork.useGpuPmeFReduction, haveLocalForceContribInCpuBuffer); - // This function call synchronizes the local stream - nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local); stateGpu->copyForcesFromGpu(forceWithShift, gmx::StatePropagatorDataGpu::AtomLocality::Local); + stateGpu->waitForcesReadyOnHost(gmx::StatePropagatorDataGpu::AtomLocality::Local); } else { diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp index 2f66ea4d3e..5aa64f6e25 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp @@ -371,8 +371,6 @@ void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef h GMX_ASSERT(commandStream != nullptr, "No stream is valid for copying forces with given atom locality."); copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, commandStream); - // TODO: Remove When event-based synchronization is introduced - gpuStreamSynchronize(commandStream); fReadyOnHost_[atomLocality].markEvent(commandStream); } diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index 23502b3fbe..e6cd8253b7 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -876,19 +876,6 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomL } -void nbnxn_wait_for_gpu_force_reduction(const AtomLocality gmx_unused atomLocality, - gmx_nbnxn_gpu_t *nb) -{ - GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); - - const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality); - - cudaStream_t stream = nb->stream[iLocality]; - - cudaStreamSynchronize(stream); - -} - void* nbnxn_get_x_on_device_event(const gmx_nbnxn_cuda_t *nb) { return static_cast (nb->xAvailableOnDevice); diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp index 0ec882f3b7..4fb8cbacda 100644 --- a/src/gromacs/nbnxm/nbnxm.cpp +++ b/src/gromacs/nbnxm/nbnxm.cpp @@ -260,11 +260,6 @@ void nonbonded_verlet_t::insertNonlocalGpuDependency(const Nbnxm::InteractionLoc Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality); } -void nonbonded_verlet_t::wait_for_gpu_force_reduction(const Nbnxm::AtomLocality locality) -{ - nbnxn_wait_for_gpu_force_reduction(locality, gpu_nbv); -} - void* nonbonded_verlet_t::get_x_on_device_event() { return Nbnxm::nbnxn_get_x_on_device_event(gpu_nbv); diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index 2f25f701cb..a5ef5baff3 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -360,12 +360,6 @@ struct nonbonded_verlet_t /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */ void atomdata_init_add_nbat_f_to_f_gpu(); - /*! \brief Wait for GPU force reduction task and D2H transfer of its results to complete - * - * FIXME: need more details: when should be called / after which operation, etc. - */ - void wait_for_gpu_force_reduction(Nbnxm::AtomLocality locality); - /*! \brief return pointer to GPU event recorded when coordinates have been copied to device */ void* get_x_on_device_event(); diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h index ccb2aa5bde..afdafb80ff 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu.h +++ b/src/gromacs/nbnxm/nbnxm_gpu.h @@ -319,11 +319,6 @@ void nbnxn_gpu_add_nbat_f_to_f(AtomLocality gmx_un bool gmx_unused useGpuFPmeReduction, bool gmx_unused accumulateForce) CUDA_FUNC_TERM; -/*! \brief Wait for GPU stream to complete */ -CUDA_FUNC_QUALIFIER -void nbnxn_wait_for_gpu_force_reduction(AtomLocality gmx_unused atomLocality, - gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM; - /*! \brief sync CPU thread on coordinate copy to device * \param[in] nb The nonbonded data GPU structure */ -- 2.22.0