These operations are issued in the local stream at the beginning of the step
and therefore always complete before the local kernel launch. The non-local
- kernel is launched after the local on the same device/context, so this is
+ kernel is launched after the local on the same device/context, hence it is
inherently scheduled after the operations in the local stream (including the
- above "misc_ops").
- However, for the sake of having a future-proof implementation, we use the
- misc_ops_done event to record the point in time when the above operations
- are finished and synchronize with this event in the non-local stream.
+ above "misc_ops") on pre-GK110 devices with a single hardware queue, but on later
+ devices with multiple hardware queues the dependency needs to be enforced.
+ We use the misc_ops_and_local_H2D_done event to record the point where
+ the local x+q H2D (and all preceding) tasks are complete and synchronize
+ with this event in the non-local stream before launching the non-bonded kernel.
*/
void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
const nbnxn_atomdata_t *nbatom,
adat_len = adat->natoms - adat->natoms_local;
}
- /* When we get here all misc operations issues in the local stream are done,
- so we record that in the local stream and wait for it in the nonlocal one. */
- if (cu_nb->bUseTwoStreams)
- {
- if (iloc == eintLocal)
- {
- stat = cudaEventRecord(cu_nb->misc_ops_done, stream);
- CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
- }
- else
- {
- stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0);
- CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
- }
- }
-
/* beginning of timed HtoD section */
if (bDoTime)
{
cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
adat_len * sizeof(*adat->xq), stream);
+ /* When we get here all misc operations issued in the local stream as well as
+ the local xq H2D are done,
+ so we record that in the local stream and wait for it in the nonlocal one. */
+ if (cu_nb->bUseTwoStreams)
+ {
+ if (iloc == eintLocal)
+ {
+ stat = cudaEventRecord(cu_nb->misc_ops_and_local_H2D_done, stream);
+ CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
+ }
+ else
+ {
+ stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_and_local_H2D_done, 0);
+ CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed");
+ }
+ }
+
if (bDoTime)
{
stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream);
/* init events for sychronization (timing disabled for performance reasons!) */
stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming);
CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed");
- stat = cudaEventCreateWithFlags(&nb->misc_ops_done, cudaEventDisableTiming);
- CU_RET_ERR(stat, "cudaEventCreate on misc_ops_one failed");
+ stat = cudaEventCreateWithFlags(&nb->misc_ops_and_local_H2D_done, cudaEventDisableTiming);
+ CU_RET_ERR(stat, "cudaEventCreate on misc_ops_and_local_H2D_done failed");
/* On GPUs with ECC enabled, cudaStreamSynchronize shows a large overhead
* (which increases with shorter time/step) caused by a known CUDA driver bug.
stat = cudaEventDestroy(cu_nb->nonlocal_done);
CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
- stat = cudaEventDestroy(cu_nb->misc_ops_done);
- CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_done");
+ stat = cudaEventDestroy(cu_nb->misc_ops_and_local_H2D_done);
+ CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_and_local_H2D_done");
if (cu_nb->bDoTime)
{