From 92d9b39b7ac25613052a6ba178d513dd142151ac Mon Sep 17 00:00:00 2001
From: Szilard Pall <pall.szilard@gmail.com>
Date: Mon, 29 Jun 2015 03:58:35 +0200
Subject: [PATCH] Fix CUDA inter-stream synchronization issue

With the introduction of multiple hardware queues in CC 3.5 and later
NVIDIA GPUs, the implicit dependency between tasks in the local and
non-local kernel got eliminated. However, as the misc_ops_done event
that we sync with in the non-local stream preceded the local coordinate
transfer, even though the tasks in the local stream are always issued
first, under rare circumstances the non-local kernel could start before
the local coordinate transfer completes. This would lead to non-local
interactions being calculated using coordinates (and charges) from the
previous step.

This change moves the synchronization point to creating a dependency
between the local coordinate transfer and non-local non-bonded kernel.

Change-Id: I0b3837d46db6469f6b1d9869a3a73b5176d93d99
---
 src/mdlib/nbnxn_cuda/nbnxn_cuda.cu           | 44 ++++++++++----------
 src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu |  8 ++--
 src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h      |  2 +-
 3 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu
index 31be75dd60..047f56d21b 100644
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@ -253,12 +253,13 @@ static inline int calc_shmem_required(int kver)
 
    These operations are issued in the local stream at the beginning of the step
    and therefore always complete before the local kernel launch. The non-local
-   kernel is launched after the local on the same device/context, so this is
+   kernel is launched after the local on the same device/context hence it is
    inherently scheduled after the operations in the local stream (including the
-   above "misc_ops").
-   However, for the sake of having a future-proof implementation, we use the
-   misc_ops_done event to record the point in time when the above  operations
-   are finished and synchronize with this event in the non-local stream.
+   above "misc_ops") on pre-GK110 devices with single hardware queue, but on later
+   devices with multiple hardware queues the dependency needs to be enforced.
+   We use the misc_ops_and_local_H2D_done event to record the point where
+   the local x+q H2D (and all preceding) tasks are complete and synchronize
+   with this event in the non-local stream before launching the non-bonded kernel.
 */
 void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
                               const nbnxn_atomdata_t *nbatom,
@@ -311,22 +312,6 @@ void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
         adat_len    = adat->natoms - adat->natoms_local;
     }
 
-    /* When we get here all misc operations issues in the local stream are done,
-       so we record that in the local stream and wait for it in the nonlocal one. */
-    if (cu_nb->bUseTwoStreams)
-    {
-        if (iloc == eintLocal)
-        {
-            stat = cudaEventRecord(cu_nb->misc_ops_done, stream);
-            CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
-        }
-        else
-        {
-            stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0);
-            CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
-        }
-    }
-
     /* beginning of timed HtoD section */
     if (bDoTime)
     {
@@ -338,6 +323,23 @@ void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
     cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
                       adat_len * sizeof(*adat->xq), stream); 
 
+    /* When we get here all misc operations issues in the local stream as well as
+       the local xq H2D are done,
+       so we record that in the local stream and wait for it in the nonlocal one. */
+    if (cu_nb->bUseTwoStreams)
+    {
+        if (iloc == eintLocal)
+        {
+            stat = cudaEventRecord(cu_nb->misc_ops_and_local_H2D_done, stream);
+            CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
+        }
+        else
+        {
+            stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_and_local_H2D_done, 0);
+            CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed");
+        }
+    }
+
     if (bDoTime)
     {
         stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream);
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
index a35bfedb81..377dcd5071 100644
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -576,8 +576,8 @@ void nbnxn_cuda_init(FILE *fplog,
     /* init events for sychronization (timing disabled for performance reasons!) */
     stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming);
     CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed");
-    stat = cudaEventCreateWithFlags(&nb->misc_ops_done, cudaEventDisableTiming);
-    CU_RET_ERR(stat, "cudaEventCreate on misc_ops_one failed");
+    stat = cudaEventCreateWithFlags(&nb->misc_ops_and_local_H2D_done, cudaEventDisableTiming);
+    CU_RET_ERR(stat, "cudaEventCreate on misc_ops_and_local_H2D_done failed");
 
     /* On GPUs with ECC enabled, cudaStreamSynchronize shows a large overhead
      * (which increases with shorter time/step) caused by a known CUDA driver bug.
@@ -933,8 +933,8 @@ void nbnxn_cuda_free(nbnxn_cuda_ptr_t cu_nb)
 
     stat = cudaEventDestroy(cu_nb->nonlocal_done);
     CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
-    stat = cudaEventDestroy(cu_nb->misc_ops_done);
-    CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_done");
+    stat = cudaEventDestroy(cu_nb->misc_ops_and_local_H2D_done);
+    CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_and_local_H2D_done");
 
     if (cu_nb->bDoTime)
     {
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
index 53cebe4f25..a5cbdc0946 100644
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
@@ -199,7 +199,7 @@ struct nbnxn_cuda
     cudaStream_t     stream[2];      /* local and non-local GPU streams                      */
 
     /* events used for synchronization */
-    cudaEvent_t    nonlocal_done, misc_ops_done;
+    cudaEvent_t    nonlocal_done, misc_ops_and_local_H2D_done;
 
     /* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple
      * concurrent streams, so we won't time if both l/nl work is done on GPUs.
-- 
2.22.0