adat_len = adat->natoms - adat->natoms_local;
}
- /* When we get here all misc operations issues in the local stream are done,
+ /* beginning of timed HtoD section */
+
+ /* HtoD x, q */
+ ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4,
+ adat_len * sizeof(float) * 4, stream, bDoTime ? (&(t->nb_h2d[iloc])) : NULL);
+
+ /* When we get here all misc operations issued in the local stream as well as
+ the local xq H2D are done,
so we record that in the local stream and wait for it in the nonlocal one. */
if (nb->bUseTwoStreams)
{
if (iloc == eintLocal)
{
#ifdef CL_VERSION_1_2
- cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_done));
+ cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_and_local_H2D_done));
#else
- cl_error = clEnqueueMarker(stream, &(nb->misc_ops_done));
+ cl_error = clEnqueueMarker(stream, &(nb->misc_ops_and_local_H2D_done));
#endif
assert(CL_SUCCESS == cl_error);
}
else
{
- sync_ocl_event(stream, &(nb->misc_ops_done));
+ sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done));
}
}
- /* beginning of timed HtoD section */
-
- /* HtoD x, q */
- ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4,
- adat_len * sizeof(float) * 4, stream, bDoTime ? (&(t->nb_h2d[iloc])) : NULL);
-
if (plist->nsci == 0)
{
/* Don't launch an empty local kernel (is not allowed with OpenCL).