From: Artem Zhmurov Date: Thu, 11 Mar 2021 20:04:05 +0000 (+0000) Subject: Unify insertNonLocalDependency(...) function in NBNXM X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=50de4ce73b83ca968b7a30b86f1c7fa6598539cf;p=alexxy%2Fgromacs.git Unify insertNonLocalDependency(...) function in NBNXM Refs #2608 --- diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index 43bf518ad9..6e3ad5cdfe 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -424,29 +424,6 @@ static inline int calc_shmem_required_nonbonded(const int num_thre return shmem; } -void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality) -{ - const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality]; - - /* When we get here all misc operations issued in the local stream as well as - the local xq H2D are done, - so we record that in the local stream and wait for it in the nonlocal one. - This wait needs to precede any PP tasks, bonded or nonbonded, that may - compute on interactions between local and nonlocal atoms. - */ - if (nb->bUseTwoStreams) - { - if (interactionLocality == InteractionLocality::Local) - { - nb->misc_ops_and_local_H2D_done.markEvent(deviceStream); - } - else - { - nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream); - } - } -} - /*! 
As we execute nonbonded workload in separate streams, before launching the kernel we need to make sure that the following operations have completed: - atomdata allocation and related H2D transfers (every nstlist step); diff --git a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp index 3fa7ad9c77..bd056bcee8 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp +++ b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp @@ -68,6 +68,7 @@ #include "gromacs/nbnxm/gpu_data_mgmt.h" #include "gromacs/timing/gpu_timing.h" #include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/exceptions.h" #include "gromacs/utility/fatalerror.h" #include "nbnxm_gpu.h" @@ -429,6 +430,45 @@ bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality)); } +inline void issueClFlushInStream(const DeviceStream& gmx_unused deviceStream) +{ +#if GMX_GPU_OPENCL + /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed + * in the stream after marking an event in it in order to be able to sync with + * the event from another stream. + */ + cl_int cl_error = clFlush(deviceStream.stream()); + if (cl_error != CL_SUCCESS) + { + GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error))); + } +#endif +} + +void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality) +{ + const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality]; + + /* When we get here all misc operations issued in the local stream as well as + the local xq H2D are done, + so we record that in the local stream and wait for it in the nonlocal one. + This wait needs to precede any PP tasks, bonded or nonbonded, that may + compute on interactions between local and nonlocal atoms.
+ */ + if (nb->bUseTwoStreams) + { + if (interactionLocality == InteractionLocality::Local) + { + nb->misc_ops_and_local_H2D_done.markEvent(deviceStream); + issueClFlushInStream(deviceStream); + } + else + { + nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream); + } + } +} + /*! \brief Launch asynchronously the xq buffer host to device copy. */ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality) { diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp index 8f3e58fdd6..af7f4ad86f 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp @@ -489,37 +489,6 @@ static void fillin_ocl_structures(NBParamGpu* nbp, cl_nbparam_params_t* nbparams nbparams_params->vdw_switch = nbp->vdw_switch; } -void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality) -{ - const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality]; - - /* When we get here all misc operations issued in the local stream as well as - the local xq H2D are done, - so we record that in the local stream and wait for it in the nonlocal one. - This wait needs to precede any PP tasks, bonded or nonbonded, that may - compute on interactions between local and nonlocal atoms. - */ - if (nb->bUseTwoStreams) - { - if (interactionLocality == InteractionLocality::Local) - { - nb->misc_ops_and_local_H2D_done.markEvent(deviceStream); - - /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed - * in the local stream in order to be able to sync with the above event - * from the non-local stream. - */ - cl_int gmx_used_in_debug cl_error = clFlush(deviceStream.stream()); - GMX_ASSERT(cl_error == CL_SUCCESS, - ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str()); - } - else - { - nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream); - } - } -} - /*! 
\brief Launch GPU kernel As we execute nonbonded workload in separate queues, before launching diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp index 1f57c36bf6..1a130a6fb5 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp @@ -51,24 +51,6 @@ namespace Nbnxm { - -void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality) -{ - const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality]; - if (nb->bUseTwoStreams) - { - if (interactionLocality == InteractionLocality::Local) - { - nb->misc_ops_and_local_H2D_done.markEvent(deviceStream); - } - else - { - nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream); - } - } -} - - /*! \brief * Launch asynchronously the download of nonbonded forces from the GPU * (and energies/shift forces if required).