Unify insertNonLocalDependency(...) function in NBNXM
authorArtem Zhmurov <zhmurov@gmail.com>
Thu, 11 Mar 2021 20:04:05 +0000 (20:04 +0000)
committerArtem Zhmurov <zhmurov@gmail.com>
Thu, 11 Mar 2021 20:04:05 +0000 (20:04 +0000)
Refs #2608

src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp

index 43bf518ad992132769122f39213962a630766e5c..6e3ad5cdfe1f87e7b39996a6b55218654ae629a4 100644 (file)
@@ -424,29 +424,6 @@ static inline int calc_shmem_required_nonbonded(const int               num_thre
     return shmem;
 }
 
-void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
-{
-    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
-
-    /* When we get here all misc operations issued in the local stream as well as
-       the local xq H2D are done,
-       so we record that in the local stream and wait for it in the nonlocal one.
-       This wait needs to precede any PP tasks, bonded or nonbonded, that may
-       compute on interactions between local and nonlocal atoms.
-     */
-    if (nb->bUseTwoStreams)
-    {
-        if (interactionLocality == InteractionLocality::Local)
-        {
-            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
-        }
-        else
-        {
-            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
-        }
-    }
-}
-
 /*! As we execute nonbonded workload in separate streams, before launching
    the kernel we need to make sure that he following operations have completed:
    - atomdata allocation and related H2D transfers (every nstlist step);
index 3fa7ad9c779c9e223ee3436e6381082740afd0f4..bd056bcee8a840a817f9afcfe71b23d9f44f3be8 100644 (file)
@@ -68,6 +68,7 @@
 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 #include "gromacs/timing/gpu_timing.h"
 #include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
 #include "gromacs/utility/fatalerror.h"
 
 #include "nbnxm_gpu.h"
@@ -429,6 +430,45 @@ bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality
     return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
 }
 
+inline void issueClFlushInStream(const DeviceStream& gmx_unused deviceStream)
+{
+#if GMX_GPU_OPENCL
+    /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
+     * in the stream after marking an event in it in order to be able to sync with
+     * the event from another stream.
+     */
+    cl_int cl_error = clFlush(deviceStream.stream());
+    if (cl_error != CL_SUCCESS)
+    {
+        GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
+    }
+#endif
+}
+
+void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
+{
+    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
+
+    /* When we get here all misc operations issued in the local stream as well as
+       the local xq H2D are done,
+       so we record that in the local stream and wait for it in the nonlocal one.
+       This wait needs to precede any PP tasks, bonded or nonbonded, that may
+       compute on interactions between local and nonlocal atoms.
+     */
+    if (nb->bUseTwoStreams)
+    {
+        if (interactionLocality == InteractionLocality::Local)
+        {
+            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
+            issueClFlushInStream(deviceStream);
+        }
+        else
+        {
+            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
+        }
+    }
+}
+
 /*! \brief Launch asynchronously the xq buffer host to device copy. */
 void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
 {
index 8f3e58fdd6b729d84a4ceaa6de41eaed67d30419..af7f4ad86fd4c393c9cef9fb12bd8e747e5154e9 100644 (file)
@@ -489,37 +489,6 @@ static void fillin_ocl_structures(NBParamGpu* nbp, cl_nbparam_params_t* nbparams
     nbparams_params->vdw_switch        = nbp->vdw_switch;
 }
 
-void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
-{
-    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
-
-    /* When we get here all misc operations issued in the local stream as well as
-       the local xq H2D are done,
-       so we record that in the local stream and wait for it in the nonlocal one.
-       This wait needs to precede any PP tasks, bonded or nonbonded, that may
-       compute on interactions between local and nonlocal atoms.
-     */
-    if (nb->bUseTwoStreams)
-    {
-        if (interactionLocality == InteractionLocality::Local)
-        {
-            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
-
-            /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
-             * in the local stream in order to be able to sync with the above event
-             * from the non-local stream.
-             */
-            cl_int gmx_used_in_debug cl_error = clFlush(deviceStream.stream());
-            GMX_ASSERT(cl_error == CL_SUCCESS,
-                       ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
-        }
-        else
-        {
-            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
-        }
-    }
-}
-
 /*! \brief Launch GPU kernel
 
    As we execute nonbonded workload in separate queues, before launching
index 1f57c36bf66880bb114729207524d45eaed180dc..1a130a6fb5ed3dc3b13d6021e7f328246e598bc1 100644 (file)
 namespace Nbnxm
 {
 
-
-void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
-{
-    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
-    if (nb->bUseTwoStreams)
-    {
-        if (interactionLocality == InteractionLocality::Local)
-        {
-            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
-        }
-        else
-        {
-            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
-        }
-    }
-}
-
-
 /*! \brief
  * Launch asynchronously the download of nonbonded forces from the GPU
  * (and energies/shift forces if required).