Avoid GPU data race also with OpenCL
author Mark Abraham <mark.j.abraham@gmail.com>
Mon, 29 Jun 2015 22:32:52 +0000 (00:32 +0200)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Tue, 30 Jun 2015 10:37:57 +0000 (12:37 +0200)
Implements the same change to non-local stream synchronization as now
used for CUDA.

Fixes #1756

Change-Id: I720edc0951f97dcff0bd477084fff45a149f01d9

src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h

index 344464350d0831bd9ee4032aeed1f2c727ec2a1b..c7d0b8a623906c00cf8b9b94c0a8308937c91611 100644 (file)
@@ -451,31 +451,32 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t               *nb,
         adat_len    = adat->natoms - adat->natoms_local;
     }
 
-    /* When we get here all misc operations issues in the local stream are done,
+    /* beginning of timed HtoD section */
+
+    /* HtoD x, q */
+    ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4,
+                       adat_len * sizeof(float) * 4, stream, bDoTime ? (&(t->nb_h2d[iloc])) : NULL);
+
+    /* When we get here all misc operations issued in the local stream as well as
+       the local xq H2D are done,
        so we record that in the local stream and wait for it in the nonlocal one. */
     if (nb->bUseTwoStreams)
     {
         if (iloc == eintLocal)
         {
 #ifdef CL_VERSION_1_2
-            cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_done));
+            cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_and_local_H2D_done));
 #else
-            cl_error = clEnqueueMarker(stream, &(nb->misc_ops_done));
+            cl_error = clEnqueueMarker(stream, &(nb->misc_ops_and_local_H2D_done));
 #endif
             assert(CL_SUCCESS == cl_error);
         }
         else
         {
-            sync_ocl_event(stream, &(nb->misc_ops_done));
+            sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done));
         }
     }
 
-    /* beginning of timed HtoD section */
-
-    /* HtoD x, q */
-    ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4,
-                       adat_len * sizeof(float) * 4, stream, bDoTime ? (&(t->nb_h2d[iloc])) : NULL);
-
     if (plist->nsci == 0)
     {
         /* Don't launch an empty local kernel (is not allowed with OpenCL).
index fcd6da8c52aab8b579cf44aeb8e05eeaed767527..f10d874d87b90f5720dd99463d3eada8ecfcc46b 100644 (file)
@@ -1065,10 +1065,10 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb)
         clReleaseEvent(nb->nonlocal_done);
         nb->nonlocal_done = NULL;
     }
-    if (nb->misc_ops_done)
+    if (nb->misc_ops_and_local_H2D_done)
     {
-        clReleaseEvent(nb->misc_ops_done);
-        nb->misc_ops_done = NULL;
+        clReleaseEvent(nb->misc_ops_and_local_H2D_done);
+        nb->misc_ops_and_local_H2D_done = NULL;
     }
 
     /* Free timers and timings */
index 2f3396416d0ee1b18b6c8b9930acf9450e6b8a81..f6ee45829229e765c638bc25ddce8074884db7bb 100644 (file)
@@ -297,10 +297,12 @@ struct gmx_nbnxn_ocl_t
     cl_command_queue    stream[2];      /**< local and non-local GPU queues                             */
 
     /** events used for synchronization */
-    cl_event    nonlocal_done;           /**< event triggered when the non-local non-bonded kernel
-                                              is done (and the local transfer can proceed)               */
-    cl_event    misc_ops_done;           /**< event triggered when the operations that precede the
-                                              main force calculations are done (e.g. buffer 0-ing)       */
+    cl_event nonlocal_done;              /**< event triggered when the non-local non-bonded kernel
+                                            is done (and the local transfer can proceed) */
+    cl_event misc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in
+                                            the local stream that need to precede the
+                                            non-local force calculations are done
+                                            (e.g. f buffer 0-ing, local x/q H2D) */
 
     cl_bool                     bDoTime; /**< True if event-based timing is enabled.                     */
     cl_timers_t                *timers;  /**< OpenCL event-based timers.                                 */