Use GpuEventSynchronizer in NBNXM

author Artem Zhmurov <zhmurov@gmail.com>

Wed, 17 Feb 2021 10:24:31 +0000 (10:24 +0000)

committer Artem Zhmurov <zhmurov@gmail.com>

Wed, 17 Feb 2021 10:24:31 +0000 (10:24 +0000)
author Artem Zhmurov <zhmurov@gmail.com>
Wed, 17 Feb 2021 10:24:31 +0000 (10:24 +0000)
committer Artem Zhmurov <zhmurov@gmail.com>
Wed, 17 Feb 2021 10:24:31 +0000 (10:24 +0000)
diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer.cuh b/src/gromacs/gpu_utils/gpueventsynchronizer.cuh

index b098f3549857c9394ba80c003cdb0ac2c103fb48..f40c7d446ff45729b452a04d40662af0e6822b75 100644 (file)
--- a/src/gromacs/gpu_utils/gpueventsynchronizer.cuh
+++ b/src/gromacs/gpu_utils/gpueventsynchronizer.cuh
@@ -114,6 +114,8 @@ public:
          GMX_ASSERT(stat == cudaSuccess,
                     ("cudaStreamWaitEvent failed. " + gmx::getDeviceErrorString(stat)).c_str());
      }
+    //! Reset the event (not needed in CUDA)
+    inline void reset() {}
  
  private:
      cudaEvent_t event_;
diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h b/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h

index 2b42308e841006f07c5c8a96d26e329944d61018..0b9905450fd4014e1d70827621c96e63395d55ce 100644 (file)
--- a/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h
+++ b/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h
@@ -106,7 +106,7 @@ public:
                                           + ocl_get_error_string(clError)));
          }
  
-        releaseEvent();
+        reset();
      }
      /*! \brief Checks the completion of the underlying event and resets the object if it was. */
      inline bool isReady()
@@ -121,7 +121,7 @@ public:
          bool hasTriggered = (result == CL_COMPLETE);
          if (hasTriggered)
          {
-            releaseEvent();
+            reset();
          }
          return hasTriggered;
      }
@@ -139,11 +139,11 @@ public:
                                           + ocl_get_error_string(clError)));
          }
  
-        releaseEvent();
+        reset();
      }
  
-private:
-    inline void releaseEvent()
+    //! Reset (release) the event to unmarked state.
+    inline void reset()
      {
          cl_int clError = clReleaseEvent(event_);
          if (CL_SUCCESS != clError)
@@ -154,6 +154,7 @@ private:
          event_ = nullptr;
      }
  
+private:
      cl_event event_;
  };
  
diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h b/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h

index 6740cc3a3dad8d2fed5ecdd6012baee1e0a98c03..cdbed7cc1b972665edd49cbfeb9425ddc0f26df6 100644 (file)
--- a/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h
+++ b/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h
@@ -131,6 +131,8 @@ public:
          deviceStream.stream().submit_barrier(waitlist);
          event_.reset();
      }
+    //! Reset the event to unmarked state.
+    inline void reset() { event_.reset(); }
  
  private:
      std::optional<cl::sycl::event> event_ = std::nullopt;
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu

index d269ac27bb94bd861824043e272a72f77f4a4d39..aa5652011b1dbeb1939990919a606cfe16827b71 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -2,7 +2,7 @@
   * This file is part of the GROMACS molecular simulation package.
   *
   * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
- * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -424,14 +424,7 @@ static inline int calc_shmem_required_nonbonded(const int               num_thre
      return shmem;
  }
  
-/*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
- *
- *  As the point where the local stream tasks can be considered complete happens
- *  at the same call point where the nonlocal stream should be synced with the
- *  the local, this function records the event if called with the local stream as
- *  argument and inserts in the GPU stream a wait on the event on the nonlocal.
- */
-void nbnxnInsertNonlocalGpuDependency(const NbnxmGpu* nb, const InteractionLocality interactionLocality)
+void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
  {
      const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
  
@@ -445,14 +438,11 @@ void nbnxnInsertNonlocalGpuDependency(const NbnxmGpu* nb, const InteractionLocal
      {
          if (interactionLocality == InteractionLocality::Local)
          {
-            cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, deviceStream.stream());
-            CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
+            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
          }
          else
          {
-            cudaError_t stat =
-                    cudaStreamWaitEvent(deviceStream.stream(), nb->misc_ops_and_local_H2D_done, 0);
-            CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed");
+            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
          }
      }
  }
@@ -462,9 +452,6 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
  {
      GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
  
-    GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal,
-               "Only local and non-local xq transfers are supported");
-
      const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
  
      int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
@@ -489,6 +476,11 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
      {
          plist->haveFreshList = false;
  
+        // The event is marked for Local interactions unconditionally,
+        // so it has to be released here because of the early return
+        // for NonLocal interactions.
+        nb->misc_ops_and_local_H2D_done.reset();
+
          return;
      }
  
@@ -809,11 +801,14 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
  {
      GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
  
-    cudaError_t stat;
-    int         adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
+    int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
  
      /* determine interaction locality from atom locality */
      const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
+    GMX_ASSERT(iloc == InteractionLocality::Local
+                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
+               "Non-local stream is indicating that the copy back event is enqueued at the "
+               "beginning of the copy back function.");
  
      /* extract the data */
      cu_atomdata_t*      adat         = nb->atdat;
@@ -824,6 +819,7 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
      /* don't launch non-local copy-back if there was no non-local work to do */
      if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
      {
+        nb->bNonLocalStreamDoneMarked = false;
          return;
      }
  
@@ -837,10 +833,10 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
  
      /* With DD the local D2H transfer can only start after the non-local
         kernel has finished. */
-    if (iloc == InteractionLocality::Local && nb->bUseTwoStreams)
+    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
      {
-        stat = cudaStreamWaitEvent(deviceStream.stream(), nb->nonlocal_done, 0);
-        CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
+        nb->nonlocal_done.enqueueWaitEvent(deviceStream);
+        nb->bNonLocalStreamDoneMarked = false;
      }
  
      /* DtoH f
@@ -866,8 +862,8 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
         back first. */
      if (iloc == InteractionLocality::NonLocal)
      {
-        stat = cudaEventRecord(nb->nonlocal_done, deviceStream.stream());
-        CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
+        nb->nonlocal_done.markEvent(deviceStream);
+        nb->bNonLocalStreamDoneMarked = true;
      }
  
      /* only transfer energies in the local stream */
@@ -985,7 +981,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
          launchGpuKernel(kernelFn, config, deviceStream, nullptr, "XbufferOps", kernelArgs);
      }
  
-    // TODO: note that this is not necessary when there astreamre no local atoms, that is:
+    // TODO: note that this is not necessary when there are no local atoms, that is:
      // (numAtoms == 0 && interactionLoc == InteractionLocality::Local)
      // but for now we avoid that optimization
      nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu

index e442fde24f21b13dc7494b745edbb8d609939531..7d1334144fc4342db4db557226593b4ef4ae56bf 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -182,8 +182,6 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
                     const nbnxn_atomdata_t*         nbat,
                     bool                            bLocalAndNonlocal)
  {
-    cudaError_t stat;
-
      auto nb            = new NbnxmGpu();
      nb->deviceContext_ = &deviceStreamManager.context();
      snew(nb->atdat, 1);
@@ -227,14 +225,6 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
          ;
      }
  
-    /* init events for sychronization (timing disabled for performance reasons!) */
-    stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming);
-    CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed");
-    stat = cudaEventCreateWithFlags(&nb->misc_ops_and_local_H2D_done, cudaEventDisableTiming);
-    CU_RET_ERR(stat, "cudaEventCreate on misc_ops_and_local_H2D_done failed");
-
-    nb->xNonLocalCopyD2HDone = new GpuEventSynchronizer();
-
      /* WARNING: CUDA timings are incorrect with multiple streams.
       *          This is the main reason why they are disabled by default.
       */
@@ -409,7 +399,6 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
  
  void gpu_free(NbnxmGpu* nb)
  {
-    cudaError_t    stat;
      cu_atomdata_t* atdat;
      NBParamGpu*    nbparam;
  
@@ -427,11 +416,6 @@ void gpu_free(NbnxmGpu* nb)
          destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj);
      }
  
-    stat = cudaEventDestroy(nb->nonlocal_done);
-    CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
-    stat = cudaEventDestroy(nb->misc_ops_and_local_H2D_done);
-    CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_and_local_H2D_done");
-
      delete nb->timers;
  
      if (!useLjCombRule(nb->nbparam->vdwType))
@@ -610,14 +594,19 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv
          }
      }
  
-    // The above data is transferred on the local stream but is a
-    // dependency of the nonlocal stream (specifically the nonlocal X
-    // buf ops kernel).  We therefore set a dependency to ensure
-    // that the nonlocal stream waits on the local stream here.
-    // This call records an event in the local stream:
-    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
-    // ...and this call instructs the nonlocal stream to wait on that event:
-    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
+    if (gpu_nbv->bUseTwoStreams)
+    {
+        // The above data is transferred on the local stream but is a
+        // dependency of the nonlocal stream (specifically the nonlocal X
+        // buf ops kernel).  We therefore set a dependency to ensure
+        // that the nonlocal stream waits on the local stream here.
+        // This call records an event in the local stream:
+        gpu_nbv->misc_ops_and_local_H2D_done.markEvent(
+                *gpu_nbv->deviceStreams[Nbnxm::InteractionLocality::Local]);
+        // ...and this call instructs the nonlocal stream to wait on that event:
+        gpu_nbv->misc_ops_and_local_H2D_done.enqueueWaitEvent(
+                *gpu_nbv->deviceStreams[Nbnxm::InteractionLocality::NonLocal]);
+    }
  
      return;
  }
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h

index 7c92a1abdc648ab403a495a0687ce796a3e19c66..1cd1606de36903a28a3f76921ed5336870f7c66e 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
@@ -3,7 +3,7 @@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2012, The GROMACS development team.
- * Copyright (c) 2013-2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2013-2019,2020,2021, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -50,6 +50,7 @@
  #include "gromacs/gpu_utils/cudautils.cuh"
  #include "gromacs/gpu_utils/devicebuffer.h"
  #include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
  #include "gromacs/gpu_utils/gputraits.cuh"
  #include "gromacs/mdtypes/interaction_const.h"
  #include "gromacs/nbnxm/gpu_types_common.h"
@@ -142,8 +143,6 @@ struct cu_atomdata
   */
  typedef struct Nbnxm::gpu_timers_t cu_timers_t;
  
-class GpuEventSynchronizer;
-
  /*! \internal
   * \brief Main data structure for CUDA nonbonded force calculations.
   */
@@ -156,6 +155,9 @@ struct NbnxmGpu
      const DeviceContext* deviceContext_;
      /*! \brief true if doing both local/non-local NB work on GPU */
      bool bUseTwoStreams = false;
+    //! true indicates that the nonlocal_done event was marked
+    bool bNonLocalStreamDoneMarked = false;
+
      /*! \brief atom data */
      cu_atomdata_t* atdat = nullptr;
      /*! \brief array of atom indices */
@@ -185,18 +187,15 @@ struct NbnxmGpu
      /*! \brief local and non-local GPU streams */
      gmx::EnumerationArray<Nbnxm::InteractionLocality, const DeviceStream*> deviceStreams;
  
-    /*! \brief Events used for synchronization */
-    /*! \{ */
      /*! \brief Event triggered when the non-local non-bonded
       * kernel is done (and the local transfer can proceed) */
-    cudaEvent_t nonlocal_done = nullptr;
+    GpuEventSynchronizer nonlocal_done;
      /*! \brief Event triggered when the tasks issued in the local
       * stream that need to precede the non-local force or buffer
       * operation calculations are done (e.g. f buffer 0-ing, local
       * x/q H2D, buffer op initialization in local stream that is
       * required also by nonlocal stream ) */
-    cudaEvent_t misc_ops_and_local_H2D_done = nullptr;
-    /*! \} */
+    GpuEventSynchronizer misc_ops_and_local_H2D_done;
  
      /*! \brief True if there is work for the current domain in the
       * respective locality.
@@ -208,10 +207,6 @@ struct NbnxmGpu
       * will be true. */
      gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> haveWork = { { false } };
  
-    /*! \brief Event triggered when non-local coordinate buffer
-     * has been copied from device to host. */
-    GpuEventSynchronizer* xNonLocalCopyD2HDone = nullptr;
-
      /* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple
       * concurrent streams, so we won't time if both l/nl work is done on GPUs.
       * Timer init/uninit is still done even with timing off so only the condition
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp

index 97827a636781777822689264de2a92e6fa71c419..96714bb15454d7397a23e8f0fd4acd0b4640aaa6 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -249,9 +249,4 @@ void nonbonded_verlet_t::atomdata_init_copy_x_to_nbat_x_gpu() const
      Nbnxm::nbnxn_gpu_init_x_to_nbat_x(pairSearch_->gridSet(), gpu_nbv);
  }
  
-void nonbonded_verlet_t::insertNonlocalGpuDependency(const gmx::InteractionLocality interactionLocality) const
-{
-    Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality);
-}
-
  /*! \endcond */
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h

index b7c3101799d1e555ef80900b0d50131e2c1f157e..960e6c2b395bfaa2856de78bc9f5c759d08a5843 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -345,9 +345,6 @@ public:
      //! Init for GPU version of setup coordinates in Nbnxm
      void atomdata_init_copy_x_to_nbat_x_gpu() const;
  
-    //! Sync the nonlocal GPU stream with dependent tasks in the local queue.
-    void insertNonlocalGpuDependency(gmx::InteractionLocality interactionLocality) const;
-
      //! Returns a reference to the pairlist sets
      const PairlistSets& pairlistSets() const { return *pairlistSets_; }
  
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h

index 034b0f656243b63727b0eba1c4825d17a2359121..fe836969347c3e33d2f141f3c3923eb548969efc 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -2,7 +2,7 @@
   * This file is part of the GROMACS molecular simulation package.
   *
   * Copyright (c) 2012,2013,2014,2015,2017 by the GROMACS development team.
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -260,12 +260,18 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused& grid,
                             int gmx_unused numColumnsMax) CUDA_FUNC_TERM;
  
  /*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
+ *
+ *  As the point where the local stream tasks can be considered complete happens
+ *  at the same call point where the nonlocal stream should be synced with the
+ *  the local, this function records the event if called with the local stream as
+ *  argument and inserts in the GPU stream a wait on the event on the nonlocal.
+ *
   * \param[in] nb                   The nonbonded data GPU structure
   * \param[in] interactionLocality  Local or NonLocal sync point
   */
-CUDA_FUNC_QUALIFIER
-void nbnxnInsertNonlocalGpuDependency(const NbnxmGpu gmx_unused* nb,
-                                      gmx::InteractionLocality gmx_unused interactionLocality) CUDA_FUNC_TERM;
+GPU_FUNC_QUALIFIER
+void nbnxnInsertNonlocalGpuDependency(NbnxmGpu gmx_unused* nb,
+                                      gmx::InteractionLocality gmx_unused interactionLocality) GPU_FUNC_TERM;
  
  /*! \brief Set up internal flags that indicate what type of short-range work there is.
   *
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp

index 165079bce4cdef4d71976a283faf4f9b0e2f4ceb..16cd265d84a95c66d13760bbed60a51d74e95caf 100644 (file)
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
@@ -489,24 +489,35 @@ static void fillin_ocl_structures(NBParamGpu* nbp, cl_nbparam_params_t* nbparams
      nbparams_params->vdw_switch        = nbp->vdw_switch;
  }
  
-/*! \brief Enqueues a wait for event completion.
- *
- * Then it releases the event and sets it to 0.
- * Don't use this function when more than one wait will be issued for the event.
- * Equivalent to Cuda Stream Sync. */
-static void sync_ocl_event(cl_command_queue stream, cl_event* ocl_event)
+void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
  {
-    cl_int gmx_unused cl_error;
+    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
  
-    /* Enqueue wait */
-    cl_error = clEnqueueBarrierWithWaitList(stream, 1, ocl_event, nullptr);
-    GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str());
+    /* When we get here all misc operations issued in the local stream as well as
+       the local xq H2D are done,
+       so we record that in the local stream and wait for it in the nonlocal one.
+       This wait needs to precede any PP tasks, bonded or nonbonded, that may
+       compute on interactions between local and nonlocal atoms.
+     */
+    if (nb->bUseTwoStreams)
+    {
+        if (interactionLocality == InteractionLocality::Local)
+        {
+            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
  
-    /* Release event and reset it to 0. It is ok to release it as enqueuewaitforevents performs implicit retain for events. */
-    cl_error = clReleaseEvent(*ocl_event);
-    GMX_ASSERT(cl_error == CL_SUCCESS,
-               ("clReleaseEvent failed: " + ocl_get_error_string(cl_error)).c_str());
-    *ocl_event = nullptr;
+            /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
+             * in the local stream in order to be able to sync with the above event
+             * from the non-local stream.
+             */
+            cl_int gmx_used_in_debug cl_error = clFlush(deviceStream.stream());
+            GMX_ASSERT(cl_error == CL_SUCCESS,
+                       ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
+        }
+        else
+        {
+            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
+        }
+    }
  }
  
  /*! \brief Launch asynchronously the xq buffer host to device copy. */
@@ -539,6 +550,11 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
      {
          plist->haveFreshList = false;
  
+        // The event is marked for Local interactions unconditionally,
+        // so it has to be released here because of the early return
+        // for NonLocal interactions.
+        nb->misc_ops_and_local_H2D_done.reset();
+
          return;
      }
  
@@ -576,31 +592,13 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
          t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
      }
  
-    /* When we get here all misc operations issues in the local stream as well as
+    /* When we get here all misc operations issued in the local stream as well as
         the local xq H2D are done,
-       so we record that in the local stream and wait for it in the nonlocal one. */
-    if (nb->bUseTwoStreams)
-    {
-        if (iloc == InteractionLocality::Local)
-        {
-            cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList(
-                    deviceStream.stream(), 0, nullptr, &(nb->misc_ops_and_local_H2D_done));
-            GMX_ASSERT(cl_error == CL_SUCCESS,
-                       ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());
-
-            /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
-             * in the local stream in order to be able to sync with the above event
-             * from the non-local stream.
-             */
-            cl_error = clFlush(deviceStream.stream());
-            GMX_ASSERT(cl_error == CL_SUCCESS,
-                       ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
-        }
-        else
-        {
-            sync_ocl_event(deviceStream.stream(), &(nb->misc_ops_and_local_H2D_done));
-        }
-    }
+       so we record that in the local stream and wait for it in the nonlocal one.
+       This wait needs to precede any PP tasks, bonded or nonbonded, that may
+       compute on interactions between local and nonlocal atoms.
+     */
+    nbnxnInsertNonlocalGpuDependency(nb, iloc);
  }
  
  
@@ -944,6 +942,10 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
  
      /* determine interaction locality from atom locality */
      const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc);
+    GMX_ASSERT(iloc == InteractionLocality::Local
+                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
+               "Non-local stream is indicating that the copy back event is enqueued at the "
+               "beginning of the copy back function.");
  
      cl_atomdata_t*      adat         = nb->atdat;
      cl_timers_t*        t            = nb->timers;
@@ -955,13 +957,13 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
      {
          /* TODO An alternative way to signal that non-local work is
             complete is to use a clEnqueueMarker+clEnqueueBarrier
-           pair. However, the use of bNonLocalStreamActive has the
+           pair. However, the use of bNonLocalStreamDoneMarked has the
             advantage of being local to the host, so probably minimizes
             overhead. Curiously, for NVIDIA OpenCL with an empty-domain
             test case, overall simulation performance was higher with
             the API calls, but this has not been tested on AMD OpenCL,
             so could be worth considering in future. */
-        nb->bNonLocalStreamActive = CL_FALSE;
+        nb->bNonLocalStreamDoneMarked = false;
          return;
      }
  
@@ -975,9 +977,10 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
  
      /* With DD the local D2H transfer can only start after the non-local
         has been launched. */
-    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive)
+    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
      {
-        sync_ocl_event(deviceStream.stream(), &(nb->nonlocal_done));
+        nb->nonlocal_done.enqueueWaitEvent(deviceStream);
+        nb->bNonLocalStreamDoneMarked = false;
      }
  
      /* DtoH f */
@@ -1001,10 +1004,8 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
         data back first. */
      if (iloc == InteractionLocality::NonLocal)
      {
-        cl_error = clEnqueueMarkerWithWaitList(deviceStream.stream(), 0, nullptr, &(nb->nonlocal_done));
-        GMX_ASSERT(cl_error == CL_SUCCESS,
-                   ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());
-        nb->bNonLocalStreamActive = CL_TRUE;
+        nb->nonlocal_done.markEvent(deviceStream);
+        nb->bNonLocalStreamDoneMarked = true;
      }
  
      /* only transfer energies in the local stream */
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp

index a70b2b8a71bb47a8a3526b500648c7f22150ff6d..7472b37c70e45defb390cd77eac14c5418d8145b 100644 (file)
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
@@ -633,18 +633,6 @@ void gpu_free(NbnxmGpu* nb)
      pfree(nb->nbst.fshift);
      nb->nbst.fshift = nullptr;
  
-    /* Free other events */
-    if (nb->nonlocal_done)
-    {
-        clReleaseEvent(nb->nonlocal_done);
-        nb->nonlocal_done = nullptr;
-    }
-    if (nb->misc_ops_and_local_H2D_done)
-    {
-        clReleaseEvent(nb->misc_ops_and_local_H2D_done);
-        nb->misc_ops_and_local_H2D_done = nullptr;
-    }
-
      freeGpuProgram(nb->dev_rundata->program);
      delete nb->dev_rundata;
  
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h

index d1adddb205b14d4b899ad299aa18bbdc0492e914..2e7098227dbaeb3d7f4ade34a9e0a42ab2f7341d 100644 (file)
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
@@ -2,7 +2,7 @@
   * This file is part of the GROMACS molecular simulation package.
   *
   * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
- * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -48,6 +48,7 @@
  
  #include "gromacs/gpu_utils/devicebuffer.h"
  #include "gromacs/gpu_utils/gmxopencl.h"
+#include "gromacs/gpu_utils/gpueventsynchronizer_ocl.h"
  #include "gromacs/gpu_utils/gputraits_ocl.h"
  #include "gromacs/gpu_utils/oclutils.h"
  #include "gromacs/mdtypes/interaction_const.h"
@@ -240,8 +241,8 @@ struct NbnxmGpu
  
      //! true if doing both local/non-local NB work on GPU
      bool bUseTwoStreams = false;
-    //! true indicates that the nonlocal_done event was enqueued
-    bool bNonLocalStreamActive = false;
+    //! true indicates that the nonlocal_done event was marked
+    bool bNonLocalStreamDoneMarked = false;
  
      //! atom data
      cl_atomdata_t* atdat = nullptr;
@@ -259,13 +260,13 @@ struct NbnxmGpu
      /*! \{ */
      /*! \brief Event triggered when the non-local non-bonded
       * kernel is done (and the local transfer can proceed) */
-    cl_event nonlocal_done = nullptr;
+    GpuEventSynchronizer nonlocal_done;
      /*! \brief Event triggered when the tasks issued in the local
       * stream that need to precede the non-local force or buffer
       * operation calculations are done (e.g. f buffer 0-ing, local
       * x/q H2D, buffer op initialization in local stream that is
       * required also by nonlocal stream ) */
-    cl_event misc_ops_and_local_H2D_done = nullptr;
+    GpuEventSynchronizer misc_ops_and_local_H2D_done;
      /*! \} */
  
      //! True if there has been local/nonlocal GPU work, either bonded or nonbonded, scheduled
author	Artem Zhmurov <zhmurov@gmail.com>
	Wed, 17 Feb 2021 10:24:31 +0000 (10:24 +0000)
committer	Artem Zhmurov <zhmurov@gmail.com>
	Wed, 17 Feb 2021 10:24:31 +0000 (10:24 +0000)
src/gromacs/gpu_utils/gpueventsynchronizer.cuh		patch \| blob \| history
src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h		patch \| blob \| history
src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h		patch \| blob \| history
src/gromacs/nbnxm/nbnxm.cpp		patch \| blob \| history
src/gromacs/nbnxm/nbnxm.h		patch \| blob \| history
src/gromacs/nbnxm/nbnxm_gpu.h		patch \| blob \| history
src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp		patch \| blob \| history
src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp		patch \| blob \| history
src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h		patch \| blob \| history