Balance event consumption for GPU update code path

[alexxy/gromacs.git] / src / gromacs / mdlib / gpuforcereduction_impl.cpp
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cpp b/src/gromacs/mdlib/gpuforcereduction_impl.cpp

index fb58c5c9437ef48cbadfb5ac4c9592d9b4f006ea..3de3002b3ac3c72a6b79de09e1b59b92d8791543 100644 (file)
--- a/src/gromacs/mdlib/gpuforcereduction_impl.cpp
+++ b/src/gromacs/mdlib/gpuforcereduction_impl.cpp
@@ -47,11 +47,7 @@
  
  #include "gromacs/gpu_utils/device_stream.h"
  #include "gromacs/gpu_utils/devicebuffer.h"
-#if GMX_GPU_CUDA
-#    include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
-#elif GMX_GPU_SYCL
-#    include "gromacs/gpu_utils/gpueventsynchronizer_sycl.h"
-#endif
+#include "gromacs/gpu_utils/gpueventsynchronizer.h"
  #include "gromacs/mdlib/gpuforcereduction_impl_internal.h"
  #include "gromacs/utility/gmxassert.h"
  
@@ -77,11 +73,11 @@ void GpuForceReduction::Impl::reinit(DeviceBuffer<Float3>  baseForcePtr,
                                       const bool            accumulate,
                                       GpuEventSynchronizer* completionMarker)
  {
-    GMX_ASSERT((baseForcePtr != nullptr), "Input base force for reduction has no data");
+    GMX_ASSERT(baseForcePtr, "Input base force for reduction has no data");
      baseForce_        = baseForcePtr;
      numAtoms_         = numAtoms;
      atomStart_        = atomStart;
-    accumulate_       = static_cast<int>(accumulate);
+    accumulate_       = accumulate;
      completionMarker_ = completionMarker;
      cellInfo_.cell    = cell.data();
  
@@ -112,7 +108,7 @@ void GpuForceReduction::Impl::registerRvecForce(DeviceBuffer<RVec> forcePtr)
      rvecForceToAdd_ = forcePtr;
  };
  
-void GpuForceReduction::Impl::addDependency(GpuEventSynchronizer* const dependency)
+void GpuForceReduction::Impl::addDependency(GpuEventSynchronizer* dependency)
  {
      dependencyList_.push_back(dependency);
  }
@@ -122,32 +118,43 @@ void GpuForceReduction::Impl::execute()
      wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
      wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuNBFBufOps);
  
-    if (numAtoms_ == 0)
+    if (numAtoms_ != 0)
      {
-        return;
+        GMX_ASSERT(nbnxmForceToAdd_, "Nbnxm force for reduction has no data");
+
+        // Enqueue wait on all dependencies passed
+        for (auto* synchronizer : dependencyList_)
+        {
+            synchronizer->enqueueWaitEvent(deviceStream_);
+        }
+
+        const bool addRvecForce = static_cast<bool>(rvecForceToAdd_); // True iff initialized
+
+        launchForceReductionKernel(numAtoms_,
+                                   atomStart_,
+                                   addRvecForce,
+                                   accumulate_,
+                                   nbnxmForceToAdd_,
+                                   rvecForceToAdd_,
+                                   baseForce_,
+                                   cellInfo_.d_cell,
+                                   deviceStream_);
      }
-
-    GMX_ASSERT(nbnxmForceToAdd_, "Nbnxm force for reduction has no data");
-
-    // Enqueue wait on all dependencies passed
-    for (auto* synchronizer : dependencyList_)
+    else
      {
-        synchronizer->enqueueWaitEvent(deviceStream_);
+        /* In case we have nothing to do, but still have dependencies, we need
+         * to consume them here; our own event is still marked below.
+         * This happens sometimes in MdrunVsitesTest.
+         * See Issues #3988, #4227. */
+        for (auto* synchronizer : dependencyList_)
+        {
+            synchronizer->consume();
+        }
      }
  
-    const bool addRvecForce = static_cast<bool>(rvecForceToAdd_); // True iff initialized
-
-    launchForceReductionKernel(numAtoms_,
-                               atomStart_,
-                               addRvecForce,
-                               accumulate_,
-                               nbnxmForceToAdd_,
-                               rvecForceToAdd_,
-                               baseForce_,
-                               cellInfo_.d_cell,
-                               deviceStream_);
-
-    // Mark that kernel has been launched
+    /* Mark the completion event of this reduction step.
+     * Even if we have no work to do and have not launched the kernel, we still mark the event
+     * in order to ensure proper marking/consumption balance, see Issues #3988, #4227. */
      if (completionMarker_ != nullptr)
      {
          completionMarker_->markEvent(deviceStream_);
@@ -157,8 +164,6 @@ void GpuForceReduction::Impl::execute()
      wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
  }
  
-GpuForceReduction::Impl::~Impl() = default;
-
  GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext,
                                       const DeviceStream&  deviceStream,
                                       gmx_wallcycle*       wcycle) :
@@ -176,7 +181,7 @@ void GpuForceReduction::registerRvecForce(DeviceBuffer<RVec> forcePtr)
      impl_->registerRvecForce(forcePtr);
  }
  
-void GpuForceReduction::addDependency(GpuEventSynchronizer* const dependency)
+void GpuForceReduction::addDependency(GpuEventSynchronizer* dependency)
  {
      impl_->addDependency(dependency);
  }