From 3fed39606056516812d4ba0e7ad25ae9a8305608 Mon Sep 17 00:00:00 2001 From: Mark Abraham Date: Tue, 30 Jun 2015 00:32:52 +0200 Subject: [PATCH] Avoid GPU data race also with OpenCL Implements the same change to non-local stream synchronization as now used for CUDA. Fixes #1756 Change-Id: I720edc0951f97dcff0bd477084fff45a149f01d9 --- src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp | 21 ++++++++++--------- .../mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp | 6 +++--- src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h | 10 +++++---- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp index 344464350d..c7d0b8a623 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp @@ -451,31 +451,32 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, adat_len = adat->natoms - adat->natoms_local; } - /* When we get here all misc operations issues in the local stream are done, + /* beginning of timed HtoD section */ + + /* HtoD x, q */ + ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4, + adat_len * sizeof(float) * 4, stream, bDoTime ? (&(t->nb_h2d[iloc])) : NULL); + + /* When we get here all misc operations issues in the local stream as well as + the local xq H2D are done, so we record that in the local stream and wait for it in the nonlocal one. */ if (nb->bUseTwoStreams) { if (iloc == eintLocal) { #ifdef CL_VERSION_1_2 - cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_done)); + cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_and_local_H2D_done)); #else - cl_error = clEnqueueMarker(stream, &(nb->misc_ops_done)); + cl_error = clEnqueueMarker(stream, &(nb->misc_ops_and_local_H2D_done)); #endif assert(CL_SUCCESS == cl_error); } else { - sync_ocl_event(stream, &(nb->misc_ops_done)); + sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done)); } } - /* beginning of timed HtoD section */ - - /* HtoD x, q */ - ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4, - adat_len * sizeof(float) * 4, stream, bDoTime ? (&(t->nb_h2d[iloc])) : NULL); - if (plist->nsci == 0) { /* Don't launch an empty local kernel (is not allowed with OpenCL). diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp index fcd6da8c52..f10d874d87 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp @@ -1065,10 +1065,10 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb) clReleaseEvent(nb->nonlocal_done); nb->nonlocal_done = NULL; } - if (nb->misc_ops_done) + if (nb->misc_ops_and_local_H2D_done) { - clReleaseEvent(nb->misc_ops_done); - nb->misc_ops_done = NULL; + clReleaseEvent(nb->misc_ops_and_local_H2D_done); + nb->misc_ops_and_local_H2D_done = NULL; } /* Free timers and timings */ diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h index 2f3396416d..f6ee458292 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h @@ -297,10 +297,12 @@ struct gmx_nbnxn_ocl_t cl_command_queue stream[2]; /**< local and non-local GPU queues */ /** events used for synchronization */ - cl_event nonlocal_done; /**< event triggered when the non-local non-bonded kernel - is done (and the local transfer can proceed) */ - cl_event misc_ops_done; /**< event triggered when the operations that precede the - main force calculations are done (e.g. buffer 0-ing) */ + cl_event nonlocal_done; /**< event triggered when the non-local non-bonded kernel + is done (and the local transfer can proceed) */ + cl_event isc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in + the local stream that need to precede the + non-local force calculations are done + (e.g. f buffer 0-ing, local x/q H2D) */ cl_bool bDoTime; /**< True if event-based timing is enabled. */ cl_timers_t *timers; /**< OpenCL event-based timers. */ -- 2.22.0