Fix CUDA inter-stream synchronization issue

author Szilard Pall <pall.szilard@gmail.com>

Mon, 29 Jun 2015 01:58:35 +0000 (03:58 +0200)

committer Szilard Pall <pall.szilard@gmail.com>

Mon, 29 Jun 2015 01:58:35 +0000 (03:58 +0200)
author Szilard Pall <pall.szilard@gmail.com>
Mon, 29 Jun 2015 01:58:35 +0000 (03:58 +0200)
committer Szilard Pall <pall.szilard@gmail.com>
Mon, 29 Jun 2015 01:58:35 +0000 (03:58 +0200)
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu

index 31be75dd604a559c0d4183df9f0c3b47cf41775c..047f56d21b569ca9b84e5f50e70060c4c5e94c7d 100644 (file)
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@ -253,12 +253,13 @@ static inline int calc_shmem_required(int kver)
  
     These operations are issued in the local stream at the beginning of the step
     and therefore always complete before the local kernel launch. The non-local
-   kernel is launched after the local on the same device/context, so this is
+   kernel is launched after the local on the same device/context hence it is
     inherently scheduled after the operations in the local stream (including the
-   above "misc_ops").
-   However, for the sake of having a future-proof implementation, we use the
-   misc_ops_done event to record the point in time when the above  operations
-   are finished and synchronize with this event in the non-local stream.
+   above "misc_ops") on pre-GK110 devices with single hardware queue, but on later
+   devices with multiple hardware queues the dependency needs to be enforced.
+   We use the misc_ops_and_local_H2D_done event to record the point where
+   the local x+q H2D (and all preceding) tasks are complete and synchronize
+   with this event in the non-local stream before launching the non-bonded kernel.
  */
  void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
                                const nbnxn_atomdata_t *nbatom,
@@ -311,22 +312,6 @@ void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
          adat_len    = adat->natoms - adat->natoms_local;
      }
  
-    /* When we get here all misc operations issues in the local stream are done,
-       so we record that in the local stream and wait for it in the nonlocal one. */
-    if (cu_nb->bUseTwoStreams)
-    {
-        if (iloc == eintLocal)
-        {
-            stat = cudaEventRecord(cu_nb->misc_ops_done, stream);
-            CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
-        }
-        else
-        {
-            stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0);
-            CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
-        }
-    }
-
      /* beginning of timed HtoD section */
      if (bDoTime)
      {
@@ -338,6 +323,23 @@ void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
      cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
                        adat_len * sizeof(*adat->xq), stream); 
  
+    /* When we get here all misc operations issues in the local stream as well as
+       the local xq H2D are done,
+       so we record that in the local stream and wait for it in the nonlocal one. */
+    if (cu_nb->bUseTwoStreams)
+    {
+        if (iloc == eintLocal)
+        {
+            stat = cudaEventRecord(cu_nb->misc_ops_and_local_H2D_done, stream);
+            CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
+        }
+        else
+        {
+            stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_and_local_H2D_done, 0);
+            CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed");
+        }
+    }
+
      if (bDoTime)
      {
          stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream);
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu

index a35bfedb818b393dc12c769ccf722fc62eef66be..377dcd50712491c41454bf2551e050e5cbdeb102 100644 (file)
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -576,8 +576,8 @@ void nbnxn_cuda_init(FILE *fplog,
      /* init events for sychronization (timing disabled for performance reasons!) */
      stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming);
      CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed");
-    stat = cudaEventCreateWithFlags(&nb->misc_ops_done, cudaEventDisableTiming);
-    CU_RET_ERR(stat, "cudaEventCreate on misc_ops_one failed");
+    stat = cudaEventCreateWithFlags(&nb->misc_ops_and_local_H2D_done, cudaEventDisableTiming);
+    CU_RET_ERR(stat, "cudaEventCreate on misc_ops_and_local_H2D_done failed");
  
      /* On GPUs with ECC enabled, cudaStreamSynchronize shows a large overhead
       * (which increases with shorter time/step) caused by a known CUDA driver bug.
@@ -933,8 +933,8 @@ void nbnxn_cuda_free(nbnxn_cuda_ptr_t cu_nb)
  
      stat = cudaEventDestroy(cu_nb->nonlocal_done);
      CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
-    stat = cudaEventDestroy(cu_nb->misc_ops_done);
-    CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_done");
+    stat = cudaEventDestroy(cu_nb->misc_ops_and_local_H2D_done);
+    CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_and_local_H2D_done");
  
      if (cu_nb->bDoTime)
      {
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h

index 53cebe4f258aabcb20322461205e6ef2ed013baf..a5cbdc094677d2c085e4237e0760bc649c0c7aef 100644 (file)
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
@@ -199,7 +199,7 @@ struct nbnxn_cuda
      cudaStream_t     stream[2];      /* local and non-local GPU streams                      */
  
      /* events used for synchronization */
-    cudaEvent_t    nonlocal_done, misc_ops_done;
+    cudaEvent_t    nonlocal_done, misc_ops_and_local_H2D_done;
  
      /* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple
       * concurrent streams, so we won't time if both l/nl work is done on GPUs.
author	Szilard Pall <pall.szilard@gmail.com>
	Mon, 29 Jun 2015 01:58:35 +0000 (03:58 +0200)
committer	Szilard Pall <pall.szilard@gmail.com>
	Mon, 29 Jun 2015 01:58:35 +0000 (03:58 +0200)
src/mdlib/nbnxn_cuda/nbnxn_cuda.cu		patch \| blob \| history
src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu		patch \| blob \| history
src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h		patch \| blob \| history