From: Artem Zhmurov <zhmurov@gmail.com>
Date: Wed, 17 Feb 2021 10:24:31 +0000 (+0000)
Subject: Use GpuEventSynchronizer in NBNXM
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=21c93104cd8b2f6da24b8037e3aad995211a1a04;p=alexxy%2Fgromacs.git

Use GpuEventSynchronizer in NBNXM

This unifies the event-based synchronization in CUDA and OpenCL
by using a wrapper with platform-agnostic interfaces for events.

Refs #2608
---

diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer.cuh b/src/gromacs/gpu_utils/gpueventsynchronizer.cuh
index b098f35498..f40c7d446f 100644
--- a/src/gromacs/gpu_utils/gpueventsynchronizer.cuh
+++ b/src/gromacs/gpu_utils/gpueventsynchronizer.cuh
@@ -114,6 +114,8 @@ public:
         GMX_ASSERT(stat == cudaSuccess,
                    ("cudaStreamWaitEvent failed. " + gmx::getDeviceErrorString(stat)).c_str());
     }
+    //! Reset the event (not needed in CUDA)
+    inline void reset() {}
 
 private:
     cudaEvent_t event_;
diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h b/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h
index 2b42308e84..0b9905450f 100644
--- a/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h
+++ b/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h
@@ -106,7 +106,7 @@ public:
                                          + ocl_get_error_string(clError)));
         }
 
-        releaseEvent();
+        reset();
     }
     /*! \brief Checks the completion of the underlying event and resets the object if it was. */
     inline bool isReady()
@@ -121,7 +121,7 @@ public:
         bool hasTriggered = (result == CL_COMPLETE);
         if (hasTriggered)
         {
-            releaseEvent();
+            reset();
         }
         return hasTriggered;
     }
@@ -139,11 +139,11 @@ public:
                                          + ocl_get_error_string(clError)));
         }
 
-        releaseEvent();
+        reset();
     }
 
-private:
-    inline void releaseEvent()
+    //! Reset (release) the event to unmarked state.
+    inline void reset()
     {
         cl_int clError = clReleaseEvent(event_);
         if (CL_SUCCESS != clError)
@@ -154,6 +154,7 @@ private:
         event_ = nullptr;
     }
 
+private:
     cl_event event_;
 };
 
diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h b/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h
index 6740cc3a3d..cdbed7cc1b 100644
--- a/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h
+++ b/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h
@@ -131,6 +131,8 @@ public:
         deviceStream.stream().submit_barrier(waitlist);
         event_.reset();
     }
+    //! Reset the event to unmarked state.
+    inline void reset() { event_.reset(); }
 
 private:
     std::optional<cl::sycl::event> event_ = std::nullopt;
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index d269ac27bb..aa5652011b 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -2,7 +2,7 @@
  * This file is part of the GROMACS molecular simulation package.
  *
  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
- * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -424,14 +424,7 @@ static inline int calc_shmem_required_nonbonded(const int               num_thre
     return shmem;
 }
 
-/*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
- *
- *  As the point where the local stream tasks can be considered complete happens
- *  at the same call point where the nonlocal stream should be synced with the
- *  the local, this function records the event if called with the local stream as
- *  argument and inserts in the GPU stream a wait on the event on the nonlocal.
- */
-void nbnxnInsertNonlocalGpuDependency(const NbnxmGpu* nb, const InteractionLocality interactionLocality)
+void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
 {
     const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
 
@@ -445,14 +438,11 @@ void nbnxnInsertNonlocalGpuDependency(const NbnxmGpu* nb, const InteractionLocal
     {
         if (interactionLocality == InteractionLocality::Local)
         {
-            cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, deviceStream.stream());
-            CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
+            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
         }
         else
         {
-            cudaError_t stat =
-                    cudaStreamWaitEvent(deviceStream.stream(), nb->misc_ops_and_local_H2D_done, 0);
-            CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed");
+            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
         }
     }
 }
@@ -462,9 +452,6 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 
-    GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal,
-               "Only local and non-local xq transfers are supported");
-
     const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
 
     int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
@@ -489,6 +476,11 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
     {
         plist->haveFreshList = false;
 
+        // The event is marked for Local interactions unconditionally,
+        // so it has to be released here because of the early return
+        // for NonLocal interactions.
+        nb->misc_ops_and_local_H2D_done.reset();
+
         return;
     }
 
@@ -809,11 +801,14 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 
-    cudaError_t stat;
-    int         adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
+    int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 
     /* determine interaction locality from atom locality */
     const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
+    GMX_ASSERT(iloc == InteractionLocality::Local
+                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
+               "Non-local stream is indicating that the copy back event is enqueued at the "
+               "beginning of the copy back function.");
 
     /* extract the data */
     cu_atomdata_t*      adat         = nb->atdat;
@@ -824,6 +819,7 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
     /* don't launch non-local copy-back if there was no non-local work to do */
     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
     {
+        nb->bNonLocalStreamDoneMarked = false;
         return;
     }
 
@@ -837,10 +833,10 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
 
     /* With DD the local D2H transfer can only start after the non-local
        kernel has finished. */
-    if (iloc == InteractionLocality::Local && nb->bUseTwoStreams)
+    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
     {
-        stat = cudaStreamWaitEvent(deviceStream.stream(), nb->nonlocal_done, 0);
-        CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
+        nb->nonlocal_done.enqueueWaitEvent(deviceStream);
+        nb->bNonLocalStreamDoneMarked = false;
     }
 
     /* DtoH f
@@ -866,8 +862,8 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
        back first. */
     if (iloc == InteractionLocality::NonLocal)
     {
-        stat = cudaEventRecord(nb->nonlocal_done, deviceStream.stream());
-        CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
+        nb->nonlocal_done.markEvent(deviceStream);
+        nb->bNonLocalStreamDoneMarked = true;
     }
 
     /* only transfer energies in the local stream */
@@ -985,7 +981,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
         launchGpuKernel(kernelFn, config, deviceStream, nullptr, "XbufferOps", kernelArgs);
     }
 
-    // TODO: note that this is not necessary when there astreamre no local atoms, that is:
+    // TODO: note that this is not necessary when there are no local atoms, that is:
     // (numAtoms == 0 && interactionLoc == InteractionLocality::Local)
     // but for now we avoid that optimization
     nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
index e442fde24f..7d1334144f 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -182,8 +182,6 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
                    const nbnxn_atomdata_t*         nbat,
                    bool                            bLocalAndNonlocal)
 {
-    cudaError_t stat;
-
     auto nb            = new NbnxmGpu();
     nb->deviceContext_ = &deviceStreamManager.context();
     snew(nb->atdat, 1);
@@ -227,14 +225,6 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
         ;
     }
 
-    /* init events for sychronization (timing disabled for performance reasons!) */
-    stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming);
-    CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed");
-    stat = cudaEventCreateWithFlags(&nb->misc_ops_and_local_H2D_done, cudaEventDisableTiming);
-    CU_RET_ERR(stat, "cudaEventCreate on misc_ops_and_local_H2D_done failed");
-
-    nb->xNonLocalCopyD2HDone = new GpuEventSynchronizer();
-
     /* WARNING: CUDA timings are incorrect with multiple streams.
      *          This is the main reason why they are disabled by default.
      */
@@ -409,7 +399,6 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
 
 void gpu_free(NbnxmGpu* nb)
 {
-    cudaError_t    stat;
     cu_atomdata_t* atdat;
     NBParamGpu*    nbparam;
 
@@ -427,11 +416,6 @@ void gpu_free(NbnxmGpu* nb)
         destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj);
     }
 
-    stat = cudaEventDestroy(nb->nonlocal_done);
-    CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
-    stat = cudaEventDestroy(nb->misc_ops_and_local_H2D_done);
-    CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_and_local_H2D_done");
-
     delete nb->timers;
 
     if (!useLjCombRule(nb->nbparam->vdwType))
@@ -610,14 +594,19 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv
         }
     }
 
-    // The above data is transferred on the local stream but is a
-    // dependency of the nonlocal stream (specifically the nonlocal X
-    // buf ops kernel).  We therefore set a dependency to ensure
-    // that the nonlocal stream waits on the local stream here.
-    // This call records an event in the local stream:
-    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
-    // ...and this call instructs the nonlocal stream to wait on that event:
-    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
+    if (gpu_nbv->bUseTwoStreams)
+    {
+        // The above data is transferred on the local stream but is a
+        // dependency of the nonlocal stream (specifically the nonlocal X
+        // buf ops kernel).  We therefore set a dependency to ensure
+        // that the nonlocal stream waits on the local stream here.
+        // This call records an event in the local stream:
+        gpu_nbv->misc_ops_and_local_H2D_done.markEvent(
+                *gpu_nbv->deviceStreams[Nbnxm::InteractionLocality::Local]);
+        // ...and this call instructs the nonlocal stream to wait on that event:
+        gpu_nbv->misc_ops_and_local_H2D_done.enqueueWaitEvent(
+                *gpu_nbv->deviceStreams[Nbnxm::InteractionLocality::NonLocal]);
+    }
 
     return;
 }
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
index 7c92a1abdc..1cd1606de3 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2012, The GROMACS development team.
- * Copyright (c) 2013-2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2013-2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -50,6 +50,7 @@
 #include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/gpu_utils/gputraits.cuh"
 #include "gromacs/mdtypes/interaction_const.h"
 #include "gromacs/nbnxm/gpu_types_common.h"
@@ -142,8 +143,6 @@ struct cu_atomdata
  */
 typedef struct Nbnxm::gpu_timers_t cu_timers_t;
 
-class GpuEventSynchronizer;
-
 /*! \internal
  * \brief Main data structure for CUDA nonbonded force calculations.
  */
@@ -156,6 +155,9 @@ struct NbnxmGpu
     const DeviceContext* deviceContext_;
     /*! \brief true if doing both local/non-local NB work on GPU */
     bool bUseTwoStreams = false;
+    //! true indicates that the nonlocal_done event was marked
+    bool bNonLocalStreamDoneMarked = false;
+
     /*! \brief atom data */
     cu_atomdata_t* atdat = nullptr;
     /*! \brief array of atom indices */
@@ -185,18 +187,15 @@ struct NbnxmGpu
     /*! \brief local and non-local GPU streams */
     gmx::EnumerationArray<Nbnxm::InteractionLocality, const DeviceStream*> deviceStreams;
 
-    /*! \brief Events used for synchronization */
-    /*! \{ */
     /*! \brief Event triggered when the non-local non-bonded
      * kernel is done (and the local transfer can proceed) */
-    cudaEvent_t nonlocal_done = nullptr;
+    GpuEventSynchronizer nonlocal_done;
     /*! \brief Event triggered when the tasks issued in the local
      * stream that need to precede the non-local force or buffer
      * operation calculations are done (e.g. f buffer 0-ing, local
      * x/q H2D, buffer op initialization in local stream that is
      * required also by nonlocal stream ) */
-    cudaEvent_t misc_ops_and_local_H2D_done = nullptr;
-    /*! \} */
+    GpuEventSynchronizer misc_ops_and_local_H2D_done;
 
     /*! \brief True if there is work for the current domain in the
      * respective locality.
@@ -208,10 +207,6 @@ struct NbnxmGpu
      * will be true. */
     gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> haveWork = { { false } };
 
-    /*! \brief Event triggered when non-local coordinate buffer
-     * has been copied from device to host. */
-    GpuEventSynchronizer* xNonLocalCopyD2HDone = nullptr;
-
     /* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple
      * concurrent streams, so we won't time if both l/nl work is done on GPUs.
      * Timer init/uninit is still done even with timing off so only the condition
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index 97827a6367..96714bb154 100644
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -249,9 +249,4 @@ void nonbonded_verlet_t::atomdata_init_copy_x_to_nbat_x_gpu() const
     Nbnxm::nbnxn_gpu_init_x_to_nbat_x(pairSearch_->gridSet(), gpu_nbv);
 }
 
-void nonbonded_verlet_t::insertNonlocalGpuDependency(const gmx::InteractionLocality interactionLocality) const
-{
-    Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality);
-}
-
 /*! \endcond */
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index b7c3101799..960e6c2b39 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -345,9 +345,6 @@ public:
     //! Init for GPU version of setup coordinates in Nbnxm
     void atomdata_init_copy_x_to_nbat_x_gpu() const;
 
-    //! Sync the nonlocal GPU stream with dependent tasks in the local queue.
-    void insertNonlocalGpuDependency(gmx::InteractionLocality interactionLocality) const;
-
     //! Returns a reference to the pairlist sets
     const PairlistSets& pairlistSets() const { return *pairlistSets_; }
 
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index 034b0f6562..fe83696934 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -2,7 +2,7 @@
  * This file is part of the GROMACS molecular simulation package.
  *
  * Copyright (c) 2012,2013,2014,2015,2017 by the GROMACS development team.
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -260,12 +260,18 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused& grid,
                            int gmx_unused numColumnsMax) CUDA_FUNC_TERM;
 
 /*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
+ *
+ *  As the point where the local stream tasks can be considered complete happens
+ *  at the same call point where the nonlocal stream should be synced with the
+ *  the local, this function records the event if called with the local stream as
+ *  argument and inserts in the GPU stream a wait on the event on the nonlocal.
+ *
  * \param[in] nb                   The nonbonded data GPU structure
  * \param[in] interactionLocality  Local or NonLocal sync point
  */
-CUDA_FUNC_QUALIFIER
-void nbnxnInsertNonlocalGpuDependency(const NbnxmGpu gmx_unused* nb,
-                                      gmx::InteractionLocality gmx_unused interactionLocality) CUDA_FUNC_TERM;
+GPU_FUNC_QUALIFIER
+void nbnxnInsertNonlocalGpuDependency(NbnxmGpu gmx_unused* nb,
+                                      gmx::InteractionLocality gmx_unused interactionLocality) GPU_FUNC_TERM;
 
 /*! \brief Set up internal flags that indicate what type of short-range work there is.
  *
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
index 165079bce4..16cd265d84 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
@@ -489,24 +489,35 @@ static void fillin_ocl_structures(NBParamGpu* nbp, cl_nbparam_params_t* nbparams
     nbparams_params->vdw_switch        = nbp->vdw_switch;
 }
 
-/*! \brief Enqueues a wait for event completion.
- *
- * Then it releases the event and sets it to 0.
- * Don't use this function when more than one wait will be issued for the event.
- * Equivalent to Cuda Stream Sync. */
-static void sync_ocl_event(cl_command_queue stream, cl_event* ocl_event)
+void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
 {
-    cl_int gmx_unused cl_error;
+    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
 
-    /* Enqueue wait */
-    cl_error = clEnqueueBarrierWithWaitList(stream, 1, ocl_event, nullptr);
-    GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str());
+    /* When we get here all misc operations issued in the local stream as well as
+       the local xq H2D are done,
+       so we record that in the local stream and wait for it in the nonlocal one.
+       This wait needs to precede any PP tasks, bonded or nonbonded, that may
+       compute on interactions between local and nonlocal atoms.
+     */
+    if (nb->bUseTwoStreams)
+    {
+        if (interactionLocality == InteractionLocality::Local)
+        {
+            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
 
-    /* Release event and reset it to 0. It is ok to release it as enqueuewaitforevents performs implicit retain for events. */
-    cl_error = clReleaseEvent(*ocl_event);
-    GMX_ASSERT(cl_error == CL_SUCCESS,
-               ("clReleaseEvent failed: " + ocl_get_error_string(cl_error)).c_str());
-    *ocl_event = nullptr;
+            /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
+             * in the local stream in order to be able to sync with the above event
+             * from the non-local stream.
+             */
+            cl_int gmx_used_in_debug cl_error = clFlush(deviceStream.stream());
+            GMX_ASSERT(cl_error == CL_SUCCESS,
+                       ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
+        }
+        else
+        {
+            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
+        }
+    }
 }
 
 /*! \brief Launch asynchronously the xq buffer host to device copy. */
@@ -539,6 +550,11 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
     {
         plist->haveFreshList = false;
 
+        // The event is marked for Local interactions unconditionally,
+        // so it has to be released here because of the early return
+        // for NonLocal interactions.
+        nb->misc_ops_and_local_H2D_done.reset();
+
         return;
     }
 
@@ -576,31 +592,13 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
         t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
     }
 
-    /* When we get here all misc operations issues in the local stream as well as
+    /* When we get here all misc operations issued in the local stream as well as
        the local xq H2D are done,
-       so we record that in the local stream and wait for it in the nonlocal one. */
-    if (nb->bUseTwoStreams)
-    {
-        if (iloc == InteractionLocality::Local)
-        {
-            cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList(
-                    deviceStream.stream(), 0, nullptr, &(nb->misc_ops_and_local_H2D_done));
-            GMX_ASSERT(cl_error == CL_SUCCESS,
-                       ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());
-
-            /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
-             * in the local stream in order to be able to sync with the above event
-             * from the non-local stream.
-             */
-            cl_error = clFlush(deviceStream.stream());
-            GMX_ASSERT(cl_error == CL_SUCCESS,
-                       ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
-        }
-        else
-        {
-            sync_ocl_event(deviceStream.stream(), &(nb->misc_ops_and_local_H2D_done));
-        }
-    }
+       so we record that in the local stream and wait for it in the nonlocal one.
+       This wait needs to precede any PP tasks, bonded or nonbonded, that may
+       compute on interactions between local and nonlocal atoms.
+     */
+    nbnxnInsertNonlocalGpuDependency(nb, iloc);
 }
 
 
@@ -944,6 +942,10 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
 
     /* determine interaction locality from atom locality */
     const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc);
+    GMX_ASSERT(iloc == InteractionLocality::Local
+                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
+               "Non-local stream is indicating that the copy back event is enqueued at the "
+               "beginning of the copy back function.");
 
     cl_atomdata_t*      adat         = nb->atdat;
     cl_timers_t*        t            = nb->timers;
@@ -955,13 +957,13 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
     {
         /* TODO An alternative way to signal that non-local work is
            complete is to use a clEnqueueMarker+clEnqueueBarrier
-           pair. However, the use of bNonLocalStreamActive has the
+           pair. However, the use of bNonLocalStreamDoneMarked has the
            advantage of being local to the host, so probably minimizes
            overhead. Curiously, for NVIDIA OpenCL with an empty-domain
            test case, overall simulation performance was higher with
            the API calls, but this has not been tested on AMD OpenCL,
            so could be worth considering in future. */
-        nb->bNonLocalStreamActive = CL_FALSE;
+        nb->bNonLocalStreamDoneMarked = false;
         return;
     }
 
@@ -975,9 +977,10 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
 
     /* With DD the local D2H transfer can only start after the non-local
        has been launched. */
-    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive)
+    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
     {
-        sync_ocl_event(deviceStream.stream(), &(nb->nonlocal_done));
+        nb->nonlocal_done.enqueueWaitEvent(deviceStream);
+        nb->bNonLocalStreamDoneMarked = false;
     }
 
     /* DtoH f */
@@ -1001,10 +1004,8 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
        data back first. */
     if (iloc == InteractionLocality::NonLocal)
     {
-        cl_error = clEnqueueMarkerWithWaitList(deviceStream.stream(), 0, nullptr, &(nb->nonlocal_done));
-        GMX_ASSERT(cl_error == CL_SUCCESS,
-                   ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());
-        nb->bNonLocalStreamActive = CL_TRUE;
+        nb->nonlocal_done.markEvent(deviceStream);
+        nb->bNonLocalStreamDoneMarked = true;
     }
 
     /* only transfer energies in the local stream */
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
index a70b2b8a71..7472b37c70 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
@@ -633,18 +633,6 @@ void gpu_free(NbnxmGpu* nb)
     pfree(nb->nbst.fshift);
     nb->nbst.fshift = nullptr;
 
-    /* Free other events */
-    if (nb->nonlocal_done)
-    {
-        clReleaseEvent(nb->nonlocal_done);
-        nb->nonlocal_done = nullptr;
-    }
-    if (nb->misc_ops_and_local_H2D_done)
-    {
-        clReleaseEvent(nb->misc_ops_and_local_H2D_done);
-        nb->misc_ops_and_local_H2D_done = nullptr;
-    }
-
     freeGpuProgram(nb->dev_rundata->program);
     delete nb->dev_rundata;
 
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
index d1adddb205..2e7098227d 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
@@ -2,7 +2,7 @@
  * This file is part of the GROMACS molecular simulation package.
  *
  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
- * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -48,6 +48,7 @@
 
 #include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/gmxopencl.h"
+#include "gromacs/gpu_utils/gpueventsynchronizer_ocl.h"
 #include "gromacs/gpu_utils/gputraits_ocl.h"
 #include "gromacs/gpu_utils/oclutils.h"
 #include "gromacs/mdtypes/interaction_const.h"
@@ -240,8 +241,8 @@ struct NbnxmGpu
 
     //! true if doing both local/non-local NB work on GPU
     bool bUseTwoStreams = false;
-    //! true indicates that the nonlocal_done event was enqueued
-    bool bNonLocalStreamActive = false;
+    //! true indicates that the nonlocal_done event was marked
+    bool bNonLocalStreamDoneMarked = false;
 
     //! atom data
     cl_atomdata_t* atdat = nullptr;
@@ -259,13 +260,13 @@ struct NbnxmGpu
     /*! \{ */
     /*! \brief Event triggered when the non-local non-bonded
      * kernel is done (and the local transfer can proceed) */
-    cl_event nonlocal_done = nullptr;
+    GpuEventSynchronizer nonlocal_done;
     /*! \brief Event triggered when the tasks issued in the local
      * stream that need to precede the non-local force or buffer
      * operation calculations are done (e.g. f buffer 0-ing, local
      * x/q H2D, buffer op initialization in local stream that is
      * required also by nonlocal stream ) */
-    cl_event misc_ops_and_local_H2D_done = nullptr;
+    GpuEventSynchronizer misc_ops_and_local_H2D_done;
     /*! \} */
 
     //! True if there has been local/nonlocal GPU work, either bonded or nonbonded, scheduled