Unify insertNonLocalDependency(...) function in NBNXM

author Artem Zhmurov <zhmurov@gmail.com>

Thu, 11 Mar 2021 20:04:05 +0000 (20:04 +0000)

committer Artem Zhmurov <zhmurov@gmail.com>

Thu, 11 Mar 2021 20:04:05 +0000 (20:04 +0000)
author Artem Zhmurov <zhmurov@gmail.com>
Thu, 11 Mar 2021 20:04:05 +0000 (20:04 +0000)
committer Artem Zhmurov <zhmurov@gmail.com>
Thu, 11 Mar 2021 20:04:05 +0000 (20:04 +0000)
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu

index 43bf518ad992132769122f39213962a630766e5c..6e3ad5cdfe1f87e7b39996a6b55218654ae629a4 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -424,29 +424,6 @@ static inline int calc_shmem_required_nonbonded(const int               num_thre
      return shmem;
  }
  
-void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
-{
-    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
-
-    /* When we get here all misc operations issued in the local stream as well as
-       the local xq H2D are done,
-       so we record that in the local stream and wait for it in the nonlocal one.
-       This wait needs to precede any PP tasks, bonded or nonbonded, that may
-       compute on interactions between local and nonlocal atoms.
-     */
-    if (nb->bUseTwoStreams)
-    {
-        if (interactionLocality == InteractionLocality::Local)
-        {
-            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
-        }
-        else
-        {
-            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
-        }
-    }
-}
-
  /*! As we execute nonbonded workload in separate streams, before launching
     the kernel we need to make sure that he following operations have completed:
     - atomdata allocation and related H2D transfers (every nstlist step);
diff --git a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp

index 3fa7ad9c779c9e223ee3436e6381082740afd0f4..bd056bcee8a840a817f9afcfe71b23d9f44f3be8 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
@@ -68,6 +68,7 @@
  #include "gromacs/nbnxm/gpu_data_mgmt.h"
  #include "gromacs/timing/gpu_timing.h"
  #include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
  #include "gromacs/utility/fatalerror.h"
  
  #include "nbnxm_gpu.h"
@@ -429,6 +430,45 @@ bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality
      return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
  }
  
+inline void issueClFlushInStream(const DeviceStream& gmx_unused deviceStream)
+{
+#if GMX_GPU_OPENCL
+    /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
+     * in the stream after marking an event in it in order to be able to sync with
+     * the event from another stream.
+     */
+    cl_int cl_error = clFlush(deviceStream.stream());
+    if (cl_error != CL_SUCCESS)
+    {
+        GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
+    }
+#endif
+}
+
+void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
+{
+    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
+
+    /* When we get here all misc operations issued in the local stream as well as
+       the local xq H2D are done,
+       so we record that in the local stream and wait for it in the nonlocal one.
+       This wait needs to precede any PP tasks, bonded or nonbonded, that may
+       compute on interactions between local and nonlocal atoms.
+     */
+    if (nb->bUseTwoStreams)
+    {
+        if (interactionLocality == InteractionLocality::Local)
+        {
+            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
+            issueClFlushInStream(deviceStream);
+        }
+        else
+        {
+            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
+        }
+    }
+}
+
  /*! \brief Launch asynchronously the xq buffer host to device copy. */
  void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
  {
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp

index 8f3e58fdd6b729d84a4ceaa6de41eaed67d30419..af7f4ad86fd4c393c9cef9fb12bd8e747e5154e9 100644 (file)
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
@@ -489,37 +489,6 @@ static void fillin_ocl_structures(NBParamGpu* nbp, cl_nbparam_params_t* nbparams
      nbparams_params->vdw_switch        = nbp->vdw_switch;
  }
  
-void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
-{
-    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
-
-    /* When we get here all misc operations issued in the local stream as well as
-       the local xq H2D are done,
-       so we record that in the local stream and wait for it in the nonlocal one.
-       This wait needs to precede any PP tasks, bonded or nonbonded, that may
-       compute on interactions between local and nonlocal atoms.
-     */
-    if (nb->bUseTwoStreams)
-    {
-        if (interactionLocality == InteractionLocality::Local)
-        {
-            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
-
-            /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
-             * in the local stream in order to be able to sync with the above event
-             * from the non-local stream.
-             */
-            cl_int gmx_used_in_debug cl_error = clFlush(deviceStream.stream());
-            GMX_ASSERT(cl_error == CL_SUCCESS,
-                       ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
-        }
-        else
-        {
-            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
-        }
-    }
-}
-
  /*! \brief Launch GPU kernel
  
     As we execute nonbonded workload in separate queues, before launching
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp

index 1f57c36bf66880bb114729207524d45eaed180dc..1a130a6fb5ed3dc3b13d6021e7f328246e598bc1 100644 (file)
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp
@@ -51,24 +51,6 @@
  namespace Nbnxm
  {
  
-
-void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
-{
-    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
-    if (nb->bUseTwoStreams)
-    {
-        if (interactionLocality == InteractionLocality::Local)
-        {
-            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
-        }
-        else
-        {
-            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
-        }
-    }
-}
-
-
  /*! \brief
   * Launch asynchronously the download of nonbonded forces from the GPU
   * (and energies/shift forces if required).
author	Artem Zhmurov <zhmurov@gmail.com>
	Thu, 11 Mar 2021 20:04:05 +0000 (20:04 +0000)
committer	Artem Zhmurov <zhmurov@gmail.com>
	Thu, 11 Mar 2021 20:04:05 +0000 (20:04 +0000)
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu		patch \| blob \| history
src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp		patch \| blob \| history
src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp		patch \| blob \| history
src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp		patch \| blob \| history