From 92d9b39b7ac25613052a6ba178d513dd142151ac Mon Sep 17 00:00:00 2001 From: Szilard Pall Date: Mon, 29 Jun 2015 03:58:35 +0200 Subject: [PATCH] Fix CUDA inter-stream synchronization issue With the introduction of multiple hardware queues in CC 3.5 and later NVIDIA GPUs, the implicit dependency between tasks in the local and non-local kernel got eliminated. However, as the misc_ops_done event that we sync with in the non-local stream preceded the local coordinate transfer, even though the tasks in the local stream are always issued first, under rare circumstances the non-local kernel could start before the local coordinate transfer completes. This would lead to non-local interactions being calculated using coordinates (and charges) from the previous step. This change moves the synchronization point to creating a dependency between the local coordinate transfer and non-local non-bonded kernel. Change-Id: I0b3837d46db6469f6b1d9869a3a73b5176d93d99 --- src/mdlib/nbnxn_cuda/nbnxn_cuda.cu | 44 ++++++++++---------- src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu | 8 ++-- src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h | 2 +- 3 files changed, 28 insertions(+), 26 deletions(-) diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu index 31be75dd60..047f56d21b 100644 --- a/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu +++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu @@ -253,12 +253,13 @@ static inline int calc_shmem_required(int kver) These operations are issued in the local stream at the beginning of the step and therefore always complete before the local kernel launch. The non-local - kernel is launched after the local on the same device/context, so this is + kernel is launched after the local on the same device/context hence it is inherently scheduled after the operations in the local stream (including the - above "misc_ops"). - However, for the sake of having a future-proof implementation, we use the - misc_ops_done event to record the point in time when the above operations - are finished and synchronize with this event in the non-local stream. + above "misc_ops") on pre-GK110 devices with single hardware queue, but on later + devices with multiple hardware queues the dependency needs to be enforced. + We use the misc_ops_and_local_H2D_done event to record the point where + the local x+q H2D (and all preceding) tasks are complete and synchronize + with this event in the non-local stream before launching the non-bonded kernel. */ void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb, const nbnxn_atomdata_t *nbatom, @@ -311,22 +312,6 @@ void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb, adat_len = adat->natoms - adat->natoms_local; } - /* When we get here all misc operations issues in the local stream are done, - so we record that in the local stream and wait for it in the nonlocal one. */ - if (cu_nb->bUseTwoStreams) - { - if (iloc == eintLocal) - { - stat = cudaEventRecord(cu_nb->misc_ops_done, stream); - CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed"); - } - else - { - stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0); - CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed"); - } - } - /* beginning of timed HtoD section */ if (bDoTime) { @@ -338,6 +323,23 @@ void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb, cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4, adat_len * sizeof(*adat->xq), stream); + /* When we get here all misc operations issues in the local stream as well as + the local xq H2D are done, + so we record that in the local stream and wait for it in the nonlocal one. */ + if (cu_nb->bUseTwoStreams) + { + if (iloc == eintLocal) + { + stat = cudaEventRecord(cu_nb->misc_ops_and_local_H2D_done, stream); + CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed"); + } + else + { + stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_and_local_H2D_done, 0); + CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed"); + } + } + if (bDoTime) { stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream); diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu index a35bfedb81..377dcd5071 100644 --- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu +++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu @@ -576,8 +576,8 @@ void nbnxn_cuda_init(FILE *fplog, /* init events for sychronization (timing disabled for performance reasons!) */ stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming); CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed"); - stat = cudaEventCreateWithFlags(&nb->misc_ops_done, cudaEventDisableTiming); - CU_RET_ERR(stat, "cudaEventCreate on misc_ops_one failed"); + stat = cudaEventCreateWithFlags(&nb->misc_ops_and_local_H2D_done, cudaEventDisableTiming); + CU_RET_ERR(stat, "cudaEventCreate on misc_ops_and_local_H2D_done failed"); /* On GPUs with ECC enabled, cudaStreamSynchronize shows a large overhead * (which increases with shorter time/step) caused by a known CUDA driver bug. @@ -933,8 +933,8 @@ void nbnxn_cuda_free(nbnxn_cuda_ptr_t cu_nb) stat = cudaEventDestroy(cu_nb->nonlocal_done); CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done"); - stat = cudaEventDestroy(cu_nb->misc_ops_done); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_done"); + stat = cudaEventDestroy(cu_nb->misc_ops_and_local_H2D_done); + CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_and_local_H2D_done"); if (cu_nb->bDoTime) { diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h index 53cebe4f25..a5cbdc0946 100644 --- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h +++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h @@ -199,7 +199,7 @@ struct nbnxn_cuda cudaStream_t stream[2]; /* local and non-local GPU streams */ /* events used for synchronization */ - cudaEvent_t nonlocal_done, misc_ops_done; + cudaEvent_t nonlocal_done, misc_ops_and_local_H2D_done; /* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple * concurrent streams, so we won't time if both l/nl work is done on GPUs. -- 2.22.0