Balance event consumption for GPU update code path

[alexxy/gromacs.git] / src / gromacs / mdlib / gpuforcereduction_impl.cpp
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cpp b/src/gromacs/mdlib/gpuforcereduction_impl.cpp

index 69876c7d15711ad318c4c193cfd09d25e66639c1..3de3002b3ac3c72a6b79de09e1b59b92d8791543 100644
--- a/src/gromacs/mdlib/gpuforcereduction_impl.cpp
+++ b/src/gromacs/mdlib/gpuforcereduction_impl.cpp
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2020, by the GROMACS development team, led by
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -34,7 +34,7 @@
   */
  /*! \internal \file
   *
- * \brief May be used to implement force reduction interfaces for non-GPU builds.
+ * \brief Implements backend-agnostic GPU Force Reduction functions
   *
   * \author Alan Gray <alang@nvidia.com>
   *
@@ -43,64 +43,163 @@
  
  #include "gmxpre.h"
  
-#include "config.h"
+#include "gpuforcereduction_impl.h"
  
-#include "gpuforcereduction.h"
-
-#if !GMX_GPU_CUDA
+#include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/gpu_utils/gpueventsynchronizer.h"
+#include "gromacs/mdlib/gpuforcereduction_impl_internal.h"
+#include "gromacs/utility/gmxassert.h"
  
  namespace gmx
  {
  
-class GpuForceReduction::Impl
+GpuForceReduction::Impl::Impl(const DeviceContext& deviceContext,
+                              const DeviceStream&  deviceStream,
+                              gmx_wallcycle*       wcycle) :
+    baseForce_(),
+    deviceContext_(deviceContext),
+    deviceStream_(deviceStream),
+    nbnxmForceToAdd_(),
+    rvecForceToAdd_(),
+    wcycle_(wcycle)
+{
+}
+
+void GpuForceReduction::Impl::reinit(DeviceBuffer<Float3>  baseForcePtr,
+                                     const int             numAtoms,
+                                     ArrayRef<const int>   cell,
+                                     const int             atomStart,
+                                     const bool            accumulate,
+                                     GpuEventSynchronizer* completionMarker)
+{
+    GMX_ASSERT(baseForcePtr, "Input base force for reduction has no data");
+    baseForce_        = baseForcePtr;
+    numAtoms_         = numAtoms;
+    atomStart_        = atomStart;
+    accumulate_       = accumulate;
+    completionMarker_ = completionMarker;
+    cellInfo_.cell    = cell.data();
+
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    reallocateDeviceBuffer(
+            &cellInfo_.d_cell, numAtoms_, &cellInfo_.cellSize, &cellInfo_.cellSizeAlloc, deviceContext_);
+    copyToDeviceBuffer(&cellInfo_.d_cell,
+                       &(cellInfo_.cell[atomStart]),
+                       0,
+                       numAtoms_,
+                       deviceStream_,
+                       GpuApiCallBehavior::Async,
+                       nullptr);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
+
+    dependencyList_.clear();
+};
+
+void GpuForceReduction::Impl::registerNbnxmForce(DeviceBuffer<RVec> forcePtr)
+{
+    GMX_ASSERT(forcePtr, "Input force for reduction has no data");
+    nbnxmForceToAdd_ = forcePtr;
+};
+
+void GpuForceReduction::Impl::registerRvecForce(DeviceBuffer<RVec> forcePtr)
  {
+    GMX_ASSERT(forcePtr, "Input force for reduction has no data");
+    rvecForceToAdd_ = forcePtr;
  };
  
-GpuForceReduction::GpuForceReduction(const DeviceContext& /* deviceContext */,
-                                     const DeviceStream& /* deviceStream */,
-                                     gmx_wallcycle* /*wcycle*/) :
-    impl_(nullptr)
+void GpuForceReduction::Impl::addDependency(GpuEventSynchronizer* dependency)
  {
-    GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
+    dependencyList_.push_back(dependency);
  }
  
-// NOLINTNEXTLINE readability-convert-member-functions-to-static
-void GpuForceReduction::reinit(DeviceBuffer<RVec> /*baseForcePtr*/,
-                               const int /*numAtoms*/,
-                               ArrayRef<const int> /*cell*/,
-                               const int /*atomStart*/,
-                               const bool /*accumulate*/,
-                               GpuEventSynchronizer* /*completionMarker*/)
+void GpuForceReduction::Impl::execute()
  {
-    GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuNBFBufOps);
+
+    if (numAtoms_ != 0)
+    {
+        GMX_ASSERT(nbnxmForceToAdd_, "Nbnxm force for reduction has no data");
+
+        // Enqueue wait on all dependencies passed
+        for (auto* synchronizer : dependencyList_)
+        {
+            synchronizer->enqueueWaitEvent(deviceStream_);
+        }
+
+        const bool addRvecForce = static_cast<bool>(rvecForceToAdd_); // True iff initialized
+
+        launchForceReductionKernel(numAtoms_,
+                                   atomStart_,
+                                   addRvecForce,
+                                   accumulate_,
+                                   nbnxmForceToAdd_,
+                                   rvecForceToAdd_,
+                                   baseForce_,
+                                   cellInfo_.d_cell,
+                                   deviceStream_);
+    }
+    else
+    {
+        /* In case we have nothing to do, but still have dependencies, we need
+         * to consume them and mark our own event.
+         * Happens sometimes in MdrunVsitesTest.
+         * Issue #3988, #4227. */
+        for (auto* synchronizer : dependencyList_)
+        {
+            synchronizer->consume();
+        }
+    }
+
+    /* Mark that kernel has been launched.
+     * Even if we have no work to do and have not launched the kernel, we still mark the event
+     * in order to ensure proper marking/consumption balance, see Issue #3988, #4227. */
+    if (completionMarker_ != nullptr)
+    {
+        completionMarker_->markEvent(deviceStream_);
+    }
+
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuNBFBufOps);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
  }
  
-// NOLINTNEXTLINE readability-convert-member-functions-to-static
-void GpuForceReduction::registerNbnxmForce(void* /* forcePtr */)
+GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext,
+                                     const DeviceStream&  deviceStream,
+                                     gmx_wallcycle*       wcycle) :
+    impl_(new Impl(deviceContext, deviceStream, wcycle))
  {
-    GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
  }
  
-// NOLINTNEXTLINE readability-convert-member-functions-to-static
-void GpuForceReduction::registerRvecForce(void* /* forcePtr */)
+void GpuForceReduction::registerNbnxmForce(DeviceBuffer<RVec> forcePtr)
  {
-    GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
+    impl_->registerNbnxmForce(forcePtr);
  }
  
-// NOLINTNEXTLINE readability-convert-member-functions-to-static
-void GpuForceReduction::addDependency(GpuEventSynchronizer* const /* dependency */)
+void GpuForceReduction::registerRvecForce(DeviceBuffer<RVec> forcePtr)
  {
-    GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
+    impl_->registerRvecForce(forcePtr);
  }
  
-// NOLINTNEXTLINE readability-convert-member-functions-to-static
+void GpuForceReduction::addDependency(GpuEventSynchronizer* dependency)
+{
+    impl_->addDependency(dependency);
+}
+
+void GpuForceReduction::reinit(DeviceBuffer<RVec>    baseForcePtr,
+                               const int             numAtoms,
+                               ArrayRef<const int>   cell,
+                               const int             atomStart,
+                               const bool            accumulate,
+                               GpuEventSynchronizer* completionMarker)
+{
+    impl_->reinit(baseForcePtr, numAtoms, cell, atomStart, accumulate, completionMarker);
+}
  void GpuForceReduction::execute()
  {
-    GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
+    impl_->execute();
  }
  
  GpuForceReduction::~GpuForceReduction() = default;
  
  } // namespace gmx
-
-#endif